Welcome to mirror list, hosted at ThFree Co, Russian Federation.

git.blender.org/blender.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
path: root/intern
diff options
context:
space:
mode:
authorJoão Araújo <jaraujo98@gmail.com>2017-07-26 13:25:24 +0300
committerJoão Araújo <jaraujo98@gmail.com>2017-07-26 13:25:24 +0300
commit59908f5eb73670c97c5bb817290a0dac99089900 (patch)
tree709de097c1fac2ff7b172a8b50dc8a91d7b74860 /intern
parent595f2ca2e06e07acaccc473982bde7a5ed644b50 (diff)
parentedc6bec9d60204cb81d2e7533402630b076d0d32 (diff)
Merge remote-tracking branch 'origin/master' into gsoc2016-improved_extrusiongsoc2016-improved_extrusion
Diffstat (limited to 'intern')
-rw-r--r--intern/audaspace/CMakeLists.txt2
-rw-r--r--intern/cycles/CMakeLists.txt2
-rw-r--r--intern/cycles/app/CMakeLists.txt19
-rw-r--r--intern/cycles/blender/addon/__init__.py3
-rw-r--r--intern/cycles/blender/addon/engine.py46
-rw-r--r--intern/cycles/blender/addon/properties.py138
-rw-r--r--intern/cycles/blender/addon/ui.py92
-rw-r--r--intern/cycles/blender/blender_curves.cpp18
-rw-r--r--intern/cycles/blender/blender_mesh.cpp52
-rw-r--r--intern/cycles/blender/blender_object.cpp28
-rw-r--r--intern/cycles/blender/blender_python.cpp9
-rw-r--r--intern/cycles/blender/blender_session.cpp290
-rw-r--r--intern/cycles/blender/blender_session.h13
-rw-r--r--intern/cycles/blender/blender_shader.cpp13
-rw-r--r--intern/cycles/blender/blender_sync.cpp137
-rw-r--r--intern/cycles/blender/blender_sync.h9
-rw-r--r--intern/cycles/blender/blender_util.h13
-rw-r--r--intern/cycles/bvh/CMakeLists.txt4
-rw-r--r--intern/cycles/bvh/bvh.cpp867
-rw-r--r--intern/cycles/bvh/bvh.h106
-rw-r--r--intern/cycles/bvh/bvh2.cpp364
-rw-r--r--intern/cycles/bvh/bvh2.h87
-rw-r--r--intern/cycles/bvh/bvh4.cpp516
-rw-r--r--intern/cycles/bvh/bvh4.h87
-rw-r--r--intern/cycles/bvh/bvh_binning.cpp4
-rw-r--r--intern/cycles/bvh/bvh_binning.h3
-rw-r--r--intern/cycles/bvh/bvh_build.cpp3
-rw-r--r--intern/cycles/bvh/bvh_build.h6
-rw-r--r--intern/cycles/bvh/bvh_node.cpp3
-rw-r--r--intern/cycles/bvh/bvh_node.h1
-rw-r--r--intern/cycles/bvh/bvh_params.h1
-rw-r--r--intern/cycles/bvh/bvh_sort.cpp3
-rw-r--r--intern/cycles/bvh/bvh_sort.h4
-rw-r--r--intern/cycles/bvh/bvh_split.cpp3
-rw-r--r--intern/cycles/bvh/bvh_unaligned.cpp1
-rw-r--r--intern/cycles/bvh/bvh_unaligned.h1
-rw-r--r--intern/cycles/cmake/external_libs.cmake8
-rw-r--r--intern/cycles/device/CMakeLists.txt2
-rw-r--r--intern/cycles/device/device.cpp16
-rw-r--r--intern/cycles/device/device.h31
-rw-r--r--intern/cycles/device/device_cpu.cpp871
-rw-r--r--intern/cycles/device/device_cuda.cpp549
-rw-r--r--intern/cycles/device/device_denoising.cpp232
-rw-r--r--intern/cycles/device/device_denoising.h148
-rw-r--r--intern/cycles/device/device_memory.h44
-rw-r--r--intern/cycles/device/device_multi.cpp54
-rw-r--r--intern/cycles/device/device_opencl.cpp13
-rw-r--r--intern/cycles/device/device_split_kernel.cpp35
-rw-r--r--intern/cycles/device/device_split_kernel.h12
-rw-r--r--intern/cycles/device/device_task.cpp6
-rw-r--r--intern/cycles/device/device_task.h14
-rw-r--r--intern/cycles/device/opencl/opencl.h83
-rw-r--r--intern/cycles/device/opencl/opencl_base.cpp516
-rw-r--r--intern/cycles/device/opencl/opencl_mega.cpp54
-rw-r--r--intern/cycles/device/opencl/opencl_split.cpp123
-rw-r--r--intern/cycles/device/opencl/opencl_util.cpp81
-rw-r--r--intern/cycles/kernel/CMakeLists.txt150
-rw-r--r--intern/cycles/kernel/closure/bsdf.h79
-rw-r--r--intern/cycles/kernel/closure/bsdf_ashikhmin_velvet.h1
-rw-r--r--intern/cycles/kernel/closure/bsdf_diffuse.h1
-rw-r--r--intern/cycles/kernel/closure/bsdf_diffuse_ramp.h1
-rw-r--r--intern/cycles/kernel/closure/bsdf_microfacet.h160
-rw-r--r--intern/cycles/kernel/closure/bsdf_microfacet_multi.h215
-rw-r--r--intern/cycles/kernel/closure/bsdf_microfacet_multi_impl.h164
-rw-r--r--intern/cycles/kernel/closure/bsdf_oren_nayar.h1
-rw-r--r--intern/cycles/kernel/closure/bsdf_phong_ramp.h1
-rw-r--r--intern/cycles/kernel/closure/bsdf_principled_diffuse.h127
-rw-r--r--intern/cycles/kernel/closure/bsdf_principled_sheen.h113
-rw-r--r--intern/cycles/kernel/closure/bsdf_toon.h1
-rw-r--r--intern/cycles/kernel/closure/bsdf_util.h20
-rw-r--r--intern/cycles/kernel/closure/bssrdf.h40
-rw-r--r--intern/cycles/kernel/filter/filter.h52
-rw-r--r--intern/cycles/kernel/filter/filter_defines.h38
-rw-r--r--intern/cycles/kernel/filter/filter_features.h124
-rw-r--r--intern/cycles/kernel/filter/filter_features_sse.h105
-rw-r--r--intern/cycles/kernel/filter/filter_kernel.h50
-rw-r--r--intern/cycles/kernel/filter/filter_nlm_cpu.h186
-rw-r--r--intern/cycles/kernel/filter/filter_nlm_gpu.h144
-rw-r--r--intern/cycles/kernel/filter/filter_prefilter.h211
-rw-r--r--intern/cycles/kernel/filter/filter_reconstruction.h117
-rw-r--r--intern/cycles/kernel/filter/filter_transform.h108
-rw-r--r--intern/cycles/kernel/filter/filter_transform_gpu.h119
-rw-r--r--intern/cycles/kernel/filter/filter_transform_sse.h105
-rw-r--r--intern/cycles/kernel/geom/geom_curve.h2
-rw-r--r--intern/cycles/kernel/geom/geom_triangle.h6
-rw-r--r--intern/cycles/kernel/kernel.h38
-rw-r--r--intern/cycles/kernel/kernel_accumulate.h159
-rw-r--r--intern/cycles/kernel/kernel_compat_cpu.h12
-rw-r--r--intern/cycles/kernel/kernel_compat_cuda.h5
-rw-r--r--intern/cycles/kernel/kernel_compat_opencl.h2
-rw-r--r--intern/cycles/kernel/kernel_globals.h16
-rw-r--r--intern/cycles/kernel/kernel_image_opencl.h63
-rw-r--r--intern/cycles/kernel/kernel_jitter.h21
-rw-r--r--intern/cycles/kernel/kernel_light.h2
-rw-r--r--intern/cycles/kernel/kernel_passes.h217
-rw-r--r--intern/cycles/kernel/kernel_path.h104
-rw-r--r--intern/cycles/kernel/kernel_path_branched.h142
-rw-r--r--intern/cycles/kernel/kernel_path_state.h16
-rw-r--r--intern/cycles/kernel/kernel_path_surface.h25
-rw-r--r--intern/cycles/kernel/kernel_path_volume.h8
-rw-r--r--intern/cycles/kernel/kernel_projection.h3
-rw-r--r--intern/cycles/kernel/kernel_queues.h15
-rw-r--r--intern/cycles/kernel/kernel_random.h238
-rw-r--r--intern/cycles/kernel/kernel_shader.h6
-rw-r--r--intern/cycles/kernel/kernel_shadow.h2
-rw-r--r--intern/cycles/kernel/kernel_subsurface.h58
-rw-r--r--intern/cycles/kernel/kernel_textures.h160
-rw-r--r--intern/cycles/kernel/kernel_types.h206
-rw-r--r--intern/cycles/kernel/kernel_volume.h2
-rw-r--r--intern/cycles/kernel/kernels/cpu/filter.cpp61
-rw-r--r--intern/cycles/kernel/kernels/cpu/filter_avx.cpp39
-rw-r--r--intern/cycles/kernel/kernels/cpu/filter_avx2.cpp40
-rw-r--r--intern/cycles/kernel/kernels/cpu/filter_cpu.h138
-rw-r--r--intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h272
-rw-r--r--intern/cycles/kernel/kernels/cpu/filter_sse2.cpp34
-rw-r--r--intern/cycles/kernel/kernels/cpu/filter_sse3.cpp36
-rw-r--r--intern/cycles/kernel/kernels/cpu/filter_sse41.cpp37
-rw-r--r--intern/cycles/kernel/kernels/cpu/kernel.cpp42
-rw-r--r--intern/cycles/kernel/kernels/cpu/kernel_avx.cpp30
-rw-r--r--intern/cycles/kernel/kernels/cpu/kernel_avx2.cpp32
-rw-r--r--intern/cycles/kernel/kernels/cpu/kernel_cpu.h5
-rw-r--r--intern/cycles/kernel/kernels/cpu/kernel_cpu_image.h82
-rw-r--r--intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h161
-rw-r--r--intern/cycles/kernel/kernels/cpu/kernel_split_avx.cpp29
-rw-r--r--intern/cycles/kernel/kernels/cpu/kernel_split_avx2.cpp32
-rw-r--r--intern/cycles/kernel/kernels/cpu/kernel_split_sse2.cpp20
-rw-r--r--intern/cycles/kernel/kernels/cpu/kernel_split_sse3.cpp24
-rw-r--r--intern/cycles/kernel/kernels/cpu/kernel_split_sse41.cpp26
-rw-r--r--intern/cycles/kernel/kernels/cpu/kernel_sse2.cpp20
-rw-r--r--intern/cycles/kernel/kernels/cpu/kernel_sse3.cpp24
-rw-r--r--intern/cycles/kernel/kernels/cpu/kernel_sse41.cpp26
-rw-r--r--intern/cycles/kernel/kernels/cuda/filter.cu255
-rw-r--r--intern/cycles/kernel/kernels/cuda/kernel_split.cu10
-rw-r--r--intern/cycles/kernel/kernels/opencl/filter.cl280
-rw-r--r--intern/cycles/kernel/kernels/opencl/kernel_buffer_update.cl13
-rw-r--r--intern/cycles/kernel/kernels/opencl/kernel_direct_lighting.cl13
-rw-r--r--intern/cycles/kernel/kernels/opencl/kernel_do_volume.cl10
-rw-r--r--intern/cycles/kernel/kernels/opencl/kernel_enqueue_inactive.cl26
-rw-r--r--intern/cycles/kernel/kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl15
-rw-r--r--intern/cycles/kernel/kernels/opencl/kernel_indirect_background.cl10
-rw-r--r--intern/cycles/kernel/kernels/opencl/kernel_indirect_subsurface.cl10
-rw-r--r--intern/cycles/kernel/kernels/opencl/kernel_lamp_emission.cl10
-rw-r--r--intern/cycles/kernel/kernels/opencl/kernel_next_iteration_setup.cl13
-rw-r--r--intern/cycles/kernel/kernels/opencl/kernel_path_init.cl10
-rw-r--r--intern/cycles/kernel/kernels/opencl/kernel_queue_enqueue.cl13
-rw-r--r--intern/cycles/kernel/kernels/opencl/kernel_scene_intersect.cl10
-rw-r--r--intern/cycles/kernel/kernels/opencl/kernel_shader_eval.cl11
-rw-r--r--intern/cycles/kernel/kernels/opencl/kernel_shader_setup.cl26
-rw-r--r--intern/cycles/kernel/kernels/opencl/kernel_shader_sort.cl27
-rw-r--r--intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked_ao.cl10
-rw-r--r--intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked_dl.cl10
-rw-r--r--intern/cycles/kernel/kernels/opencl/kernel_split.cl3
-rw-r--r--intern/cycles/kernel/kernels/opencl/kernel_split_function.h72
-rw-r--r--intern/cycles/kernel/kernels/opencl/kernel_subsurface_scatter.cl11
-rw-r--r--intern/cycles/kernel/osl/osl_bssrdf.cpp31
-rw-r--r--intern/cycles/kernel/osl/osl_closures.cpp268
-rw-r--r--intern/cycles/kernel/osl/osl_closures.h14
-rw-r--r--intern/cycles/kernel/osl/osl_services.cpp2
-rw-r--r--intern/cycles/kernel/shaders/CMakeLists.txt4
-rw-r--r--intern/cycles/kernel/shaders/node_principled_bsdf.osl120
-rw-r--r--intern/cycles/kernel/shaders/stdosl.h9
-rw-r--r--intern/cycles/kernel/split/kernel_branched.h220
-rw-r--r--intern/cycles/kernel/split/kernel_buffer_update.h16
-rw-r--r--intern/cycles/kernel/split/kernel_data_init.h25
-rw-r--r--intern/cycles/kernel/split/kernel_direct_lighting.h66
-rw-r--r--intern/cycles/kernel/split/kernel_do_volume.h190
-rw-r--r--intern/cycles/kernel/split/kernel_enqueue_inactive.h46
-rw-r--r--intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h123
-rw-r--r--intern/cycles/kernel/split/kernel_indirect_background.h8
-rw-r--r--intern/cycles/kernel/split/kernel_indirect_subsurface.h37
-rw-r--r--intern/cycles/kernel/split/kernel_next_iteration_setup.h222
-rw-r--r--intern/cycles/kernel/split/kernel_queue_enqueue.h3
-rw-r--r--intern/cycles/kernel/split/kernel_scene_intersect.h16
-rw-r--r--intern/cycles/kernel/split/kernel_shader_eval.h69
-rw-r--r--intern/cycles/kernel/split/kernel_shader_setup.h70
-rw-r--r--intern/cycles/kernel/split/kernel_shader_sort.h97
-rw-r--r--intern/cycles/kernel/split/kernel_shadow_blocked_ao.h42
-rw-r--r--intern/cycles/kernel/split/kernel_shadow_blocked_dl.h87
-rw-r--r--intern/cycles/kernel/split/kernel_split_common.h74
-rw-r--r--intern/cycles/kernel/split/kernel_split_data.h18
-rw-r--r--intern/cycles/kernel/split/kernel_split_data_types.h100
-rw-r--r--intern/cycles/kernel/split/kernel_subsurface_scatter.h334
-rw-r--r--intern/cycles/kernel/svm/svm_closure.h343
-rw-r--r--intern/cycles/kernel/svm/svm_displace.h9
-rw-r--r--intern/cycles/kernel/svm/svm_geometry.h1
-rw-r--r--intern/cycles/kernel/svm/svm_image.h222
-rw-r--r--intern/cycles/kernel/svm/svm_types.h30
-rw-r--r--intern/cycles/kernel/svm/svm_voxel.h7
-rw-r--r--intern/cycles/render/buffers.cpp63
-rw-r--r--intern/cycles/render/buffers.h15
-rw-r--r--intern/cycles/render/constant_fold.cpp8
-rw-r--r--intern/cycles/render/film.cpp22
-rw-r--r--intern/cycles/render/film.h7
-rw-r--r--intern/cycles/render/graph.cpp64
-rw-r--r--intern/cycles/render/graph.h3
-rw-r--r--intern/cycles/render/image.cpp601
-rw-r--r--intern/cycles/render/image.h46
-rw-r--r--intern/cycles/render/light.cpp15
-rw-r--r--intern/cycles/render/mesh.cpp3
-rw-r--r--intern/cycles/render/mesh_displace.cpp2
-rw-r--r--intern/cycles/render/nodes.cpp174
-rw-r--r--intern/cycles/render/nodes.h46
-rw-r--r--intern/cycles/render/osl.cpp1
-rw-r--r--intern/cycles/render/scene.h16
-rw-r--r--intern/cycles/render/session.cpp242
-rw-r--r--intern/cycles/render/session.h24
-rw-r--r--intern/cycles/render/shader.cpp34
-rw-r--r--intern/cycles/render/shader.h9
-rw-r--r--intern/cycles/render/tile.cpp199
-rw-r--r--intern/cycles/render/tile.h33
-rw-r--r--intern/cycles/test/util_string_test.cpp37
-rw-r--r--intern/cycles/util/CMakeLists.txt33
-rw-r--r--intern/cycles/util/util_atomic.h4
-rw-r--r--intern/cycles/util/util_color.h43
-rw-r--r--intern/cycles/util/util_debug.cpp6
-rw-r--r--intern/cycles/util/util_debug.h4
-rw-r--r--intern/cycles/util/util_guarded_allocator.h2
-rw-r--r--intern/cycles/util/util_logging.cpp4
-rw-r--r--intern/cycles/util/util_logging.h16
-rw-r--r--intern/cycles/util/util_math.h1182
-rw-r--r--intern/cycles/util/util_math_float2.h227
-rw-r--r--intern/cycles/util/util_math_float3.h385
-rw-r--r--intern/cycles/util/util_math_float4.h393
-rw-r--r--intern/cycles/util/util_math_int2.h77
-rw-r--r--intern/cycles/util/util_math_int3.h83
-rw-r--r--intern/cycles/util/util_math_int4.h119
-rw-r--r--intern/cycles/util/util_math_matrix.h404
-rw-r--r--intern/cycles/util/util_path.cpp57
-rw-r--r--intern/cycles/util/util_progress.h48
-rw-r--r--intern/cycles/util/util_simd.h61
-rw-r--r--intern/cycles/util/util_string.cpp6
-rw-r--r--intern/cycles/util/util_task.cpp4
-rw-r--r--intern/cycles/util/util_texture.h78
-rw-r--r--intern/cycles/util/util_types.h536
-rw-r--r--intern/cycles/util/util_types_float2.h40
-rw-r--r--intern/cycles/util/util_types_float2_impl.h59
-rw-r--r--intern/cycles/util/util_types_float3.h57
-rw-r--r--intern/cycles/util/util_types_float3_impl.h105
-rw-r--r--intern/cycles/util/util_types_float4.h61
-rw-r--r--intern/cycles/util/util_types_float4_impl.h117
-rw-r--r--intern/cycles/util/util_types_int2.h39
-rw-r--r--intern/cycles/util/util_types_int2_impl.h50
-rw-r--r--intern/cycles/util/util_types_int3.h57
-rw-r--r--intern/cycles/util/util_types_int3_impl.h106
-rw-r--r--intern/cycles/util/util_types_int4.h61
-rw-r--r--intern/cycles/util/util_types_int4_impl.h115
-rw-r--r--intern/cycles/util/util_types_uchar2.h39
-rw-r--r--intern/cycles/util/util_types_uchar2_impl.h50
-rw-r--r--intern/cycles/util/util_types_uchar3.h39
-rw-r--r--intern/cycles/util/util_types_uchar3_impl.h50
-rw-r--r--intern/cycles/util/util_types_uchar4.h39
-rw-r--r--intern/cycles/util/util_types_uchar4_impl.h50
-rw-r--r--intern/cycles/util/util_types_uint2.h39
-rw-r--r--intern/cycles/util/util_types_uint2_impl.h48
-rw-r--r--intern/cycles/util/util_types_uint3.h39
-rw-r--r--intern/cycles/util/util_types_uint3_impl.h48
-rw-r--r--intern/cycles/util/util_types_uint4.h39
-rw-r--r--intern/cycles/util/util_types_uint4_impl.h48
-rw-r--r--intern/cycles/util/util_types_vector3.h41
-rw-r--r--intern/cycles/util/util_types_vector3_impl.h47
-rw-r--r--intern/dualcon/intern/Projections.h2
-rw-r--r--intern/dualcon/intern/dualcon_c_api.cpp2
-rw-r--r--intern/elbeem/intern/isosurface.cpp33
-rw-r--r--intern/elbeem/intern/mvmcoords.h2
-rw-r--r--intern/elbeem/intern/solver_util.cpp4
-rw-r--r--intern/ffmpeg/ffmpeg_compat.h9
-rw-r--r--intern/ghost/GHOST_C-api.h2
-rw-r--r--intern/ghost/intern/GHOST_Context.cpp2
-rw-r--r--intern/ghost/intern/GHOST_DisplayManagerWin32.cpp2
-rw-r--r--intern/ghost/intern/GHOST_SystemCocoa.h5
-rw-r--r--intern/ghost/intern/GHOST_SystemCocoa.mm48
-rw-r--r--intern/ghost/intern/GHOST_SystemPathsWin32.cpp24
-rw-r--r--intern/ghost/intern/GHOST_SystemWin32.cpp22
-rw-r--r--intern/ghost/intern/GHOST_SystemWin32.h8
-rw-r--r--intern/ghost/intern/GHOST_TaskbarWin32.h8
-rw-r--r--intern/ghost/intern/GHOST_WindowCocoa.h2
-rw-r--r--intern/ghost/intern/GHOST_WindowCocoa.mm4
-rw-r--r--intern/ghost/intern/GHOST_WindowWin32.cpp9
-rw-r--r--intern/guardedalloc/intern/mallocn_intern.h8
-rw-r--r--intern/guardedalloc/intern/mallocn_lockfree_impl.c4
-rw-r--r--intern/libmv/CMakeLists.txt3
-rw-r--r--intern/libmv/ChangeLog389
-rwxr-xr-xintern/libmv/bundle.sh3
-rw-r--r--intern/libmv/intern/frame_accessor.cc58
-rw-r--r--intern/libmv/intern/frame_accessor.h15
-rw-r--r--intern/libmv/intern/logging.cc26
-rw-r--r--intern/libmv/intern/stub.cc4
-rw-r--r--intern/libmv/libmv/autotrack/autotrack.cc23
-rw-r--r--intern/libmv/libmv/autotrack/frame_accessor.h19
-rw-r--r--intern/libmv/libmv/autotrack/predict_tracks.cc2
-rw-r--r--intern/locale/CMakeLists.txt19
-rw-r--r--intern/locale/msgfmt.cc374
-rw-r--r--intern/memutil/MEM_CacheLimiterC-Api.h24
-rw-r--r--intern/opencolorio/CMakeLists.txt2
-rw-r--r--intern/opensubdiv/opensubdiv_capi.cc70
-rw-r--r--intern/opensubdiv/opensubdiv_capi.h2
-rw-r--r--intern/string/STR_HashedString.h28
297 files changed, 16739 insertions, 6243 deletions
diff --git a/intern/audaspace/CMakeLists.txt b/intern/audaspace/CMakeLists.txt
index 2d415296dac..dd446613fd0 100644
--- a/intern/audaspace/CMakeLists.txt
+++ b/intern/audaspace/CMakeLists.txt
@@ -19,7 +19,7 @@
#
# ***** END LGPL LICENSE BLOCK *****
-remove_extra_strict_flags()
+remove_strict_flags()
if(CMAKE_COMPILER_IS_GNUCC)
remove_cc_flag("-Wunused-macros")
diff --git a/intern/cycles/CMakeLists.txt b/intern/cycles/CMakeLists.txt
index 806a8660e8c..c53a9f91cc0 100644
--- a/intern/cycles/CMakeLists.txt
+++ b/intern/cycles/CMakeLists.txt
@@ -191,7 +191,7 @@ endif()
# Logging capabilities using GLog library.
if(WITH_CYCLES_LOGGING)
add_definitions(-DWITH_CYCLES_LOGGING)
- add_definitions(-DGOOGLE_GLOG_DLL_DECL=)
+ add_definitions(${GLOG_DEFINES})
add_definitions(-DCYCLES_GFLAGS_NAMESPACE=${GFLAGS_NAMESPACE})
include_directories(
SYSTEM
diff --git a/intern/cycles/app/CMakeLists.txt b/intern/cycles/app/CMakeLists.txt
index aabb8f63640..08a3931ef46 100644
--- a/intern/cycles/app/CMakeLists.txt
+++ b/intern/cycles/app/CMakeLists.txt
@@ -35,18 +35,15 @@ if(WITH_CYCLES_OSL)
list(APPEND LIBRARIES cycles_kernel_osl)
endif()
-if(CYCLES_STANDALONE_REPOSITORY)
- if(WITH_CYCLES_LOGGING)
- list(APPEND LIBRARIES
- ${GLOG_LIBRARIES}
- ${GFLAGS_LIBRARIES}
- )
- endif()
-else()
+if(NOT CYCLES_STANDALONE_REPOSITORY)
list(APPEND LIBRARIES bf_intern_glew_mx bf_intern_guardedalloc)
- if(WITH_CYCLES_LOGGING)
- list(APPEND LIBRARIES extern_glog extern_gflags)
- endif()
+endif()
+
+if(WITH_CYCLES_LOGGING)
+ list(APPEND LIBRARIES
+ ${GLOG_LIBRARIES}
+ ${GFLAGS_LIBRARIES}
+ )
endif()
if(WITH_CYCLES_STANDALONE AND WITH_CYCLES_STANDALONE_GUI)
diff --git a/intern/cycles/blender/addon/__init__.py b/intern/cycles/blender/addon/__init__.py
index eb792af7264..a2d6262fb20 100644
--- a/intern/cycles/blender/addon/__init__.py
+++ b/intern/cycles/blender/addon/__init__.py
@@ -102,6 +102,9 @@ class CyclesRender(bpy.types.RenderEngine):
else:
self.report({'ERROR'}, "OSL support disabled in this build.")
+ def update_render_passes(self, scene, srl):
+ engine.register_passes(self, scene, srl)
+
def engine_exit():
engine.exit()
diff --git a/intern/cycles/blender/addon/engine.py b/intern/cycles/blender/addon/engine.py
index ab57dd44bdb..3018fd5b316 100644
--- a/intern/cycles/blender/addon/engine.py
+++ b/intern/cycles/blender/addon/engine.py
@@ -205,3 +205,49 @@ def with_network():
def system_info():
import _cycles
return _cycles.system_info()
+
+def register_passes(engine, scene, srl):
+ engine.register_pass(scene, srl, "Combined", 4, "RGBA", 'COLOR')
+
+ if srl.use_pass_z: engine.register_pass(scene, srl, "Depth", 1, "Z", 'VALUE')
+ if srl.use_pass_mist: engine.register_pass(scene, srl, "Mist", 1, "Z", 'VALUE')
+ if srl.use_pass_normal: engine.register_pass(scene, srl, "Normal", 3, "XYZ", 'VECTOR')
+ if srl.use_pass_vector: engine.register_pass(scene, srl, "Vector", 4, "XYZW", 'VECTOR')
+ if srl.use_pass_uv: engine.register_pass(scene, srl, "UV", 3, "UVA", 'VECTOR')
+ if srl.use_pass_object_index: engine.register_pass(scene, srl, "IndexOB", 1, "X", 'VALUE')
+ if srl.use_pass_material_index: engine.register_pass(scene, srl, "IndexMA", 1, "X", 'VALUE')
+ if srl.use_pass_shadow: engine.register_pass(scene, srl, "Shadow", 3, "RGB", 'COLOR')
+ if srl.use_pass_ambient_occlusion: engine.register_pass(scene, srl, "AO", 3, "RGB", 'COLOR')
+ if srl.use_pass_diffuse_direct: engine.register_pass(scene, srl, "DiffDir", 3, "RGB", 'COLOR')
+ if srl.use_pass_diffuse_indirect: engine.register_pass(scene, srl, "DiffInd", 3, "RGB", 'COLOR')
+ if srl.use_pass_diffuse_color: engine.register_pass(scene, srl, "DiffCol", 3, "RGB", 'COLOR')
+ if srl.use_pass_glossy_direct: engine.register_pass(scene, srl, "GlossDir", 3, "RGB", 'COLOR')
+ if srl.use_pass_glossy_indirect: engine.register_pass(scene, srl, "GlossInd", 3, "RGB", 'COLOR')
+ if srl.use_pass_glossy_color: engine.register_pass(scene, srl, "GlossCol", 3, "RGB", 'COLOR')
+ if srl.use_pass_transmission_direct: engine.register_pass(scene, srl, "TransDir", 3, "RGB", 'COLOR')
+ if srl.use_pass_transmission_indirect: engine.register_pass(scene, srl, "TransInd", 3, "RGB", 'COLOR')
+ if srl.use_pass_transmission_color: engine.register_pass(scene, srl, "TransCol", 3, "RGB", 'COLOR')
+ if srl.use_pass_subsurface_direct: engine.register_pass(scene, srl, "SubsurfaceDir", 3, "RGB", 'COLOR')
+ if srl.use_pass_subsurface_indirect: engine.register_pass(scene, srl, "SubsurfaceInd", 3, "RGB", 'COLOR')
+ if srl.use_pass_subsurface_color: engine.register_pass(scene, srl, "SubsurfaceCol", 3, "RGB", 'COLOR')
+ if srl.use_pass_emit: engine.register_pass(scene, srl, "Emit", 3, "RGB", 'COLOR')
+ if srl.use_pass_environment: engine.register_pass(scene, srl, "Env", 3, "RGB", 'COLOR')
+
+ crl = srl.cycles
+ if crl.pass_debug_bvh_traversed_nodes: engine.register_pass(scene, srl, "Debug BVH Traversed Nodes", 1, "X", 'VALUE')
+ if crl.pass_debug_bvh_traversed_instances: engine.register_pass(scene, srl, "Debug BVH Traversed Instances", 1, "X", 'VALUE')
+ if crl.pass_debug_bvh_intersections: engine.register_pass(scene, srl, "Debug BVH Intersections", 1, "X", 'VALUE')
+ if crl.pass_debug_ray_bounces: engine.register_pass(scene, srl, "Debug Ray Bounces", 1, "X", 'VALUE')
+
+ cscene = scene.cycles
+ if crl.use_denoising and crl.denoising_store_passes and not cscene.use_progressive_refine:
+ engine.register_pass(scene, srl, "Denoising Normal", 3, "XYZ", 'VECTOR')
+ engine.register_pass(scene, srl, "Denoising Normal Variance", 3, "XYZ", 'VECTOR')
+ engine.register_pass(scene, srl, "Denoising Albedo", 3, "RGB", 'COLOR')
+ engine.register_pass(scene, srl, "Denoising Albedo Variance", 3, "RGB", 'COLOR')
+ engine.register_pass(scene, srl, "Denoising Depth", 1, "Z", 'VALUE')
+ engine.register_pass(scene, srl, "Denoising Depth Variance", 1, "Z", 'VALUE')
+ engine.register_pass(scene, srl, "Denoising Shadow A", 3, "XYV", 'VECTOR')
+ engine.register_pass(scene, srl, "Denoising Shadow B", 3, "XYV", 'VECTOR')
+ engine.register_pass(scene, srl, "Denoising Image", 3, "RGB", 'COLOR')
+ engine.register_pass(scene, srl, "Denoising Image Variance", 3, "RGB", 'COLOR')
diff --git a/intern/cycles/blender/addon/properties.py b/intern/cycles/blender/addon/properties.py
index cbf469b3a89..68474529ed3 100644
--- a/intern/cycles/blender/addon/properties.py
+++ b/intern/cycles/blender/addon/properties.py
@@ -695,10 +695,17 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):
update=devices_update_callback
)
- cls.debug_opencl_kernel_single_program = BoolProperty(name="Single Program", default=False, update=devices_update_callback);
+ cls.debug_opencl_kernel_single_program = BoolProperty(
+ name="Single Program",
+ default=True,
+ update=devices_update_callback,
+ )
cls.debug_use_opencl_debug = BoolProperty(name="Debug OpenCL", default=False)
+ cls.debug_opencl_mem_limit = IntProperty(name="Memory limit", default=0,
+ description="Artificial limit on OpenCL memory usage in MB (0 to disable limit)")
+
@classmethod
def unregister(cls):
del bpy.types.Scene.cycles
@@ -1166,6 +1173,125 @@ class CyclesCurveRenderSettings(bpy.types.PropertyGroup):
def unregister(cls):
del bpy.types.Scene.cycles_curves
+def update_render_passes(self, context):
+ scene = context.scene
+ rd = scene.render
+ rl = rd.layers.active
+ rl.update_render_passes()
+
+class CyclesRenderLayerSettings(bpy.types.PropertyGroup):
+ @classmethod
+ def register(cls):
+ bpy.types.SceneRenderLayer.cycles = PointerProperty(
+ name="Cycles SceneRenderLayer Settings",
+ description="Cycles SceneRenderLayer Settings",
+ type=cls,
+ )
+ cls.pass_debug_bvh_traversed_nodes = BoolProperty(
+ name="Debug BVH Traversed Nodes",
+ description="Store Debug BVH Traversed Nodes pass",
+ default=False,
+ update=update_render_passes,
+ )
+ cls.pass_debug_bvh_traversed_instances = BoolProperty(
+ name="Debug BVH Traversed Instances",
+ description="Store Debug BVH Traversed Instances pass",
+ default=False,
+ update=update_render_passes,
+ )
+ cls.pass_debug_bvh_intersections = BoolProperty(
+ name="Debug BVH Intersections",
+ description="Store Debug BVH Intersections",
+ default=False,
+ update=update_render_passes,
+ )
+ cls.pass_debug_ray_bounces = BoolProperty(
+ name="Debug Ray Bounces",
+ description="Store Debug Ray Bounces pass",
+ default=False,
+ update=update_render_passes,
+ )
+
+ cls.use_denoising = BoolProperty(
+ name="Use Denoising",
+ description="Denoise the rendered image",
+ default=False,
+ update=update_render_passes,
+ )
+ cls.denoising_diffuse_direct = BoolProperty(
+ name="Diffuse Direct",
+ description="Denoise the direct diffuse lighting",
+ default=True,
+ )
+ cls.denoising_diffuse_indirect = BoolProperty(
+ name="Diffuse Indirect",
+ description="Denoise the indirect diffuse lighting",
+ default=True,
+ )
+ cls.denoising_glossy_direct = BoolProperty(
+ name="Glossy Direct",
+ description="Denoise the direct glossy lighting",
+ default=True,
+ )
+ cls.denoising_glossy_indirect = BoolProperty(
+ name="Glossy Indirect",
+ description="Denoise the indirect glossy lighting",
+ default=True,
+ )
+ cls.denoising_transmission_direct = BoolProperty(
+ name="Transmission Direct",
+ description="Denoise the direct transmission lighting",
+ default=True,
+ )
+ cls.denoising_transmission_indirect = BoolProperty(
+ name="Transmission Indirect",
+ description="Denoise the indirect transmission lighting",
+ default=True,
+ )
+ cls.denoising_subsurface_direct = BoolProperty(
+ name="Subsurface Direct",
+ description="Denoise the direct subsurface lighting",
+ default=True,
+ )
+ cls.denoising_subsurface_indirect = BoolProperty(
+ name="Subsurface Indirect",
+ description="Denoise the indirect subsurface lighting",
+ default=True,
+ )
+ cls.denoising_strength = FloatProperty(
+ name="Denoising Strength",
+ description="Controls neighbor pixel weighting for the denoising filter (lower values preserve more detail, but aren't as smooth)",
+ min=0.0, max=1.0,
+ default=0.5,
+ )
+ cls.denoising_feature_strength = FloatProperty(
+ name="Denoising Feature Strength",
+ description="Controls removal of noisy image feature passes (lower values preserve more detail, but aren't as smooth)",
+ min=0.0, max=1.0,
+ default=0.5,
+ )
+ cls.denoising_radius = IntProperty(
+ name="Denoising Radius",
+ description="Size of the image area that's used to denoise a pixel (higher values are smoother, but might lose detail and are slower)",
+ min=1, max=25,
+ default=8,
+ )
+ cls.denoising_relative_pca = BoolProperty(
+ name="Relative filter",
+ description="When removing pixels that don't carry information, use a relative threshold instead of an absolute one (can help to reduce artifacts, but might cause detail loss around edges)",
+ default=False,
+ )
+ cls.denoising_store_passes = BoolProperty(
+ name="Store denoising passes",
+ description="Store the denoising feature passes and the noisy image",
+ default=False,
+ update=update_render_passes,
+ )
+
+ @classmethod
+ def unregister(cls):
+ del bpy.types.SceneRenderLayer.cycles
+
class CyclesCurveSettings(bpy.types.PropertyGroup):
@classmethod
@@ -1297,14 +1423,14 @@ class CyclesPreferences(bpy.types.AddonPreferences):
row = layout.row()
if self.compute_device_type == 'CUDA' and cuda_devices:
- col = row.column(align=True)
+ box = row.box()
for device in cuda_devices:
- col.prop(device, "use", text=device.name, toggle=True)
+ box.prop(device, "use", text=device.name)
if self.compute_device_type == 'OPENCL' and opencl_devices:
- col = row.column(align=True)
+ box = row.box()
for device in opencl_devices:
- col.prop(device, "use", text=device.name, toggle=True)
+ box.prop(device, "use", text=device.name)
def draw(self, context):
@@ -1324,6 +1450,7 @@ def register():
bpy.utils.register_class(CyclesCurveSettings)
bpy.utils.register_class(CyclesDeviceSettings)
bpy.utils.register_class(CyclesPreferences)
+ bpy.utils.register_class(CyclesRenderLayerSettings)
def unregister():
@@ -1339,3 +1466,4 @@ def unregister():
bpy.utils.unregister_class(CyclesCurveSettings)
bpy.utils.unregister_class(CyclesDeviceSettings)
bpy.utils.unregister_class(CyclesPreferences)
+ bpy.utils.unregister_class(CyclesRenderLayerSettings)
diff --git a/intern/cycles/blender/addon/ui.py b/intern/cycles/blender/addon/ui.py
index 2b50d272be8..49beebe5ab4 100644
--- a/intern/cycles/blender/addon/ui.py
+++ b/intern/cycles/blender/addon/ui.py
@@ -78,7 +78,7 @@ def use_cuda(context):
def use_branched_path(context):
cscene = context.scene.cycles
- return (cscene.progressive == 'BRANCHED_PATH' and not use_opencl(context))
+ return (cscene.progressive == 'BRANCHED_PATH')
def use_sample_all_lights(context):
@@ -156,7 +156,6 @@ class CyclesRender_PT_sampling(CyclesButtonsPanel, Panel):
row = layout.row()
sub = row.row()
- sub.active = get_device_type(context) != 'OPENCL' or use_cpu(context)
sub.prop(cscene, "progressive", text="")
row.prop(cscene, "use_square_samples")
@@ -204,8 +203,7 @@ class CyclesRender_PT_sampling(CyclesButtonsPanel, Panel):
col.prop(cscene, "sample_all_lights_direct")
col.prop(cscene, "sample_all_lights_indirect")
- if not (use_opencl(context) and cscene.feature_set != 'EXPERIMENTAL'):
- layout.row().prop(cscene, "sampling_pattern", text="Pattern")
+ layout.row().prop(cscene, "sampling_pattern", text="Pattern")
for rl in scene.render.layers:
if rl.samples > 0:
@@ -478,11 +476,14 @@ class CyclesRender_PT_layer_passes(CyclesButtonsPanel, Panel):
bl_options = {'DEFAULT_CLOSED'}
def draw(self, context):
+ import _cycles
+
layout = self.layout
scene = context.scene
rd = scene.render
rl = rd.layers.active
+ crl = rl.cycles
split = layout.split()
@@ -529,8 +530,18 @@ class CyclesRender_PT_layer_passes(CyclesButtonsPanel, Panel):
col.prop(rl, "use_pass_emit", text="Emission")
col.prop(rl, "use_pass_environment")
- if hasattr(rd, "debug_pass_type"):
- layout.prop(rd, "debug_pass_type")
+ if context.scene.cycles.feature_set == 'EXPERIMENTAL':
+ col.separator()
+ sub = col.column()
+ sub.active = crl.use_denoising
+ sub.prop(crl, "denoising_store_passes", text="Denoising")
+
+ if _cycles.with_cycles_debug:
+ col = layout.column()
+ col.prop(crl, "pass_debug_bvh_traversed_nodes")
+ col.prop(crl, "pass_debug_bvh_traversed_instances")
+ col.prop(crl, "pass_debug_bvh_intersections")
+ col.prop(crl, "pass_debug_ray_bounces")
class CyclesRender_PT_views(CyclesButtonsPanel, Panel):
@@ -576,6 +587,71 @@ class CyclesRender_PT_views(CyclesButtonsPanel, Panel):
row.prop(rv, "camera_suffix", text="")
+class CyclesRender_PT_denoising(CyclesButtonsPanel, Panel):
+ bl_label = "Denoising"
+ bl_context = "render_layer"
+ bl_options = {'DEFAULT_CLOSED'}
+
+ def draw_header(self, context):
+ rd = context.scene.render
+ rl = rd.layers.active
+ crl = rl.cycles
+ cscene = context.scene.cycles
+ layout = self.layout
+
+ layout.active = not cscene.use_progressive_refine
+ layout.prop(crl, "use_denoising", text="")
+
+ def draw(self, context):
+ layout = self.layout
+
+ scene = context.scene
+ cscene = scene.cycles
+ rd = scene.render
+ rl = rd.layers.active
+ crl = rl.cycles
+
+ layout.active = crl.use_denoising and not cscene.use_progressive_refine
+
+ split = layout.split()
+
+ col = split.column()
+ sub = col.column(align=True)
+ sub.prop(crl, "denoising_radius", text="Radius")
+ sub.prop(crl, "denoising_strength", slider=True, text="Strength")
+
+ col = split.column()
+ sub = col.column(align=True)
+ sub.prop(crl, "denoising_feature_strength", slider=True, text="Feature Strength")
+ sub.prop(crl, "denoising_relative_pca")
+
+ layout.separator()
+
+ row = layout.row()
+ row.label(text="Diffuse:")
+ sub = row.row(align=True)
+ sub.prop(crl, "denoising_diffuse_direct", text="Direct", toggle=True)
+ sub.prop(crl, "denoising_diffuse_indirect", text="Indirect", toggle=True)
+
+ row = layout.row()
+ row.label(text="Glossy:")
+ sub = row.row(align=True)
+ sub.prop(crl, "denoising_glossy_direct", text="Direct", toggle=True)
+ sub.prop(crl, "denoising_glossy_indirect", text="Indirect", toggle=True)
+
+ row = layout.row()
+ row.label(text="Transmission:")
+ sub = row.row(align=True)
+ sub.prop(crl, "denoising_transmission_direct", text="Direct", toggle=True)
+ sub.prop(crl, "denoising_transmission_indirect", text="Indirect", toggle=True)
+
+ row = layout.row()
+ row.label(text="Subsurface:")
+ sub = row.row(align=True)
+ sub.prop(crl, "denoising_subsurface_direct", text="Direct", toggle=True)
+ sub.prop(crl, "denoising_subsurface_indirect", text="Indirect", toggle=True)
+
+
class Cycles_PT_post_processing(CyclesButtonsPanel, Panel):
bl_label = "Post Processing"
bl_options = {'DEFAULT_CLOSED'}
@@ -1532,6 +1608,7 @@ class CyclesRender_PT_debug(CyclesButtonsPanel, Panel):
col.prop(cscene, "debug_opencl_device_type", text="Device")
col.prop(cscene, "debug_opencl_kernel_single_program", text="Single Program")
col.prop(cscene, "debug_use_opencl_debug", text="Debug")
+ col.prop(cscene, "debug_opencl_mem_limit")
class CyclesParticle_PT_CurveSettings(CyclesButtonsPanel, Panel):
@@ -1634,7 +1711,7 @@ def draw_device(self, context):
layout.prop(cscene, "feature_set")
- split = layout.split(percentage=1/3)
+ split = layout.split(percentage=1 / 3)
split.label("Device:")
row = split.row()
row.active = show_device_active(context)
@@ -1729,6 +1806,7 @@ classes = (
CyclesRender_PT_layer_options,
CyclesRender_PT_layer_passes,
CyclesRender_PT_views,
+ CyclesRender_PT_denoising,
Cycles_PT_post_processing,
CyclesCamera_PT_dof,
Cycles_PT_context_material,
diff --git a/intern/cycles/blender/blender_curves.cpp b/intern/cycles/blender/blender_curves.cpp
index 6fa038e8bf0..42b985305ea 100644
--- a/intern/cycles/blender/blender_curves.cpp
+++ b/intern/cycles/blender/blender_curves.cpp
@@ -411,7 +411,7 @@ static void ExportCurveTrianglePlanes(Mesh *mesh, ParticleCurveData *CData,
}
}
- mesh->resize_mesh(mesh->verts.size(), mesh->triangles.size());
+ mesh->resize_mesh(mesh->verts.size(), mesh->num_triangles());
mesh->attributes.remove(ATTR_STD_VERTEX_NORMAL);
mesh->attributes.remove(ATTR_STD_FACE_NORMAL);
mesh->add_face_normals();
@@ -546,7 +546,7 @@ static void ExportCurveTriangleGeometry(Mesh *mesh,
}
}
- mesh->resize_mesh(mesh->verts.size(), mesh->triangles.size());
+ mesh->resize_mesh(mesh->verts.size(), mesh->num_triangles());
mesh->attributes.remove(ATTR_STD_VERTEX_NORMAL);
mesh->attributes.remove(ATTR_STD_FACE_NORMAL);
mesh->add_face_normals();
@@ -776,17 +776,17 @@ static void ExportCurveTriangleVcol(ParticleCurveData *CData,
for(int curvekey = CData->curve_firstkey[curve]; curvekey < CData->curve_firstkey[curve] + CData->curve_keynum[curve] - 1; curvekey++) {
for(int section = 0; section < resol; section++) {
- cdata[vertexindex] = color_float_to_byte(color_srgb_to_scene_linear(CData->curve_vcol[curve]));
+ cdata[vertexindex] = color_float_to_byte(color_srgb_to_scene_linear_v3(CData->curve_vcol[curve]));
vertexindex++;
- cdata[vertexindex] = color_float_to_byte(color_srgb_to_scene_linear(CData->curve_vcol[curve]));
+ cdata[vertexindex] = color_float_to_byte(color_srgb_to_scene_linear_v3(CData->curve_vcol[curve]));
vertexindex++;
- cdata[vertexindex] = color_float_to_byte(color_srgb_to_scene_linear(CData->curve_vcol[curve]));
+ cdata[vertexindex] = color_float_to_byte(color_srgb_to_scene_linear_v3(CData->curve_vcol[curve]));
vertexindex++;
- cdata[vertexindex] = color_float_to_byte(color_srgb_to_scene_linear(CData->curve_vcol[curve]));
+ cdata[vertexindex] = color_float_to_byte(color_srgb_to_scene_linear_v3(CData->curve_vcol[curve]));
vertexindex++;
- cdata[vertexindex] = color_float_to_byte(color_srgb_to_scene_linear(CData->curve_vcol[curve]));
+ cdata[vertexindex] = color_float_to_byte(color_srgb_to_scene_linear_v3(CData->curve_vcol[curve]));
vertexindex++;
- cdata[vertexindex] = color_float_to_byte(color_srgb_to_scene_linear(CData->curve_vcol[curve]));
+ cdata[vertexindex] = color_float_to_byte(color_srgb_to_scene_linear_v3(CData->curve_vcol[curve]));
vertexindex++;
}
}
@@ -1004,7 +1004,7 @@ void BlenderSync::sync_curves(Mesh *mesh,
for(size_t curve = 0; curve < CData.curve_vcol.size(); curve++)
if(!(CData.curve_keynum[curve] <= 1 || CData.curve_length[curve] == 0.0f))
- fdata[i++] = color_srgb_to_scene_linear(CData.curve_vcol[curve]);
+ fdata[i++] = color_srgb_to_scene_linear_v3(CData.curve_vcol[curve]);
}
}
}
diff --git a/intern/cycles/blender/blender_mesh.cpp b/intern/cycles/blender/blender_mesh.cpp
index 54571b1fea1..b4cca5f00f4 100644
--- a/intern/cycles/blender/blender_mesh.cpp
+++ b/intern/cycles/blender/blender_mesh.cpp
@@ -14,7 +14,6 @@
* limitations under the License.
*/
-
#include "render/mesh.h"
#include "render/object.h"
#include "render/scene.h"
@@ -51,8 +50,7 @@ enum {
* Two triangles has vertex indices in the original Blender-side face.
* If face is already a quad tri_b will not be initialized.
*/
-inline void face_split_tri_indices(const int num_verts,
- const int face_flag,
+inline void face_split_tri_indices(const int face_flag,
int tri_a[3],
int tri_b[3])
{
@@ -60,21 +58,19 @@ inline void face_split_tri_indices(const int num_verts,
tri_a[0] = 0;
tri_a[1] = 1;
tri_a[2] = 3;
- if(num_verts == 4) {
- tri_b[0] = 2;
- tri_b[1] = 3;
- tri_b[2] = 1;
- }
+
+ tri_b[0] = 2;
+ tri_b[1] = 3;
+ tri_b[2] = 1;
}
else /*if(face_flag & FACE_FLAG_DIVIDE_13)*/ {
tri_a[0] = 0;
tri_a[1] = 1;
tri_a[2] = 2;
- if(num_verts == 4) {
- tri_b[0] = 0;
- tri_b[1] = 2;
- tri_b[2] = 3;
- }
+
+ tri_b[0] = 0;
+ tri_b[1] = 2;
+ tri_b[2] = 3;
}
}
@@ -251,7 +247,7 @@ static void mikk_compute_tangents(BL::Mesh& b_mesh,
for(int i = 0; i < nverts.size(); i++) {
int tri_a[3], tri_b[3];
- face_split_tri_indices(nverts[i], face_flags[i], tri_a, tri_b);
+ face_split_tri_indices(face_flags[i], tri_a, tri_b);
tangent[0] = float4_to_float3(userdata.tangent[i*4 + tri_a[0]]);
tangent[1] = float4_to_float3(userdata.tangent[i*4 + tri_a[1]]);
@@ -293,7 +289,7 @@ static void create_mesh_volume_attribute(BL::Object& b_ob,
if(!b_domain)
return;
-
+
Attribute *attr = mesh->attributes.add(std);
VoxelAttribute *volume_data = attr->data_voxel();
bool is_float, is_linear;
@@ -356,7 +352,7 @@ static void attr_create_vertex_color(Scene *scene,
int n = p->loop_total();
for(int i = 0; i < n; i++) {
float3 color = get_float3(l->data[p->loop_start() + i].color());
- *(cdata++) = color_float_to_byte(color_srgb_to_scene_linear(color));
+ *(cdata++) = color_float_to_byte(color_srgb_to_scene_linear_v3(color));
}
}
}
@@ -377,14 +373,14 @@ static void attr_create_vertex_color(Scene *scene,
for(l->data.begin(c); c != l->data.end(); ++c, ++i) {
int tri_a[3], tri_b[3];
- face_split_tri_indices(nverts[i], face_flags[i], tri_a, tri_b);
+ face_split_tri_indices(face_flags[i], tri_a, tri_b);
uchar4 colors[4];
- colors[0] = color_float_to_byte(color_srgb_to_scene_linear(get_float3(c->color1())));
- colors[1] = color_float_to_byte(color_srgb_to_scene_linear(get_float3(c->color2())));
- colors[2] = color_float_to_byte(color_srgb_to_scene_linear(get_float3(c->color3())));
+ colors[0] = color_float_to_byte(color_srgb_to_scene_linear_v3(get_float3(c->color1())));
+ colors[1] = color_float_to_byte(color_srgb_to_scene_linear_v3(get_float3(c->color2())));
+ colors[2] = color_float_to_byte(color_srgb_to_scene_linear_v3(get_float3(c->color3())));
if(nverts[i] == 4) {
- colors[3] = color_float_to_byte(color_srgb_to_scene_linear(get_float3(c->color4())));
+ colors[3] = color_float_to_byte(color_srgb_to_scene_linear_v3(get_float3(c->color4())));
}
cdata[0] = colors[tri_a[0]];
@@ -470,7 +466,7 @@ static void attr_create_uv_map(Scene *scene,
for(l->data.begin(t); t != l->data.end(); ++t, ++i) {
int tri_a[3], tri_b[3];
- face_split_tri_indices(nverts[i], face_flags[i], tri_a, tri_b);
+ face_split_tri_indices(face_flags[i], tri_a, tri_b);
float3 uvs[4];
uvs[0] = get_float3(t->uv1());
@@ -982,7 +978,7 @@ Mesh *BlenderSync::sync_mesh(BL::Object& b_ob,
else
used_shaders.push_back(scene->default_surface);
}
-
+
/* test if we need to sync */
int requested_geometry_flags = Mesh::GEOMETRY_NONE;
if(render_layer.use_surfaces) {
@@ -1017,12 +1013,12 @@ Mesh *BlenderSync::sync_mesh(BL::Object& b_ob,
/* ensure we only sync instanced meshes once */
if(mesh_synced.find(mesh) != mesh_synced.end())
return mesh;
-
+
mesh_synced.insert(mesh);
/* create derived mesh */
array<int> oldtriangle = mesh->triangles;
-
+
/* compares curve_keys rather than strands in order to handle quick hair
* adjustments in dynamic BVH - other methods could probably do this better*/
array<float3> oldcurve_keys = mesh->curve_keys;
@@ -1111,7 +1107,7 @@ Mesh *BlenderSync::sync_mesh(BL::Object& b_ob,
if(memcmp(&oldcurve_radius[0], &mesh->curve_radius[0], sizeof(float)*oldcurve_radius.size()) != 0)
rebuild = true;
}
-
+
mesh->tag_update(scene, rebuild);
return mesh;
@@ -1140,7 +1136,7 @@ void BlenderSync::sync_mesh_motion(BL::Object& b_ob,
if(scene->need_motion() == Scene::MOTION_BLUR) {
if(!mesh->use_motion_blur)
return;
-
+
/* see if this mesh needs motion data at this time */
vector<float> object_times = object->motion_times();
bool found = false;
@@ -1172,7 +1168,7 @@ void BlenderSync::sync_mesh_motion(BL::Object& b_ob,
if(!numverts && !numkeys)
return;
-
+
/* skip objects without deforming modifiers. this is not totally reliable,
* would need a more extensive check to see which objects are animated */
BL::Mesh b_mesh(PointerRNA_NULL);
diff --git a/intern/cycles/blender/blender_object.cpp b/intern/cycles/blender/blender_object.cpp
index d05699236cc..a930c439370 100644
--- a/intern/cycles/blender/blender_object.cpp
+++ b/intern/cycles/blender/blender_object.cpp
@@ -379,27 +379,16 @@ Object *BlenderSync::sync_object(BL::Object& b_parent,
}
}
- /* random number */
- object->random_id = hash_string(object->name.c_str());
-
- if(persistent_id) {
- for(int i = 0; i < OBJECT_PERSISTENT_ID_SIZE; i++)
- object->random_id = hash_int_2d(object->random_id, persistent_id[i]);
- }
- else
- object->random_id = hash_int_2d(object->random_id, 0);
-
- if(b_parent.ptr.data != b_ob.ptr.data)
- object->random_id ^= hash_int(hash_string(b_parent.name().c_str()));
-
- /* dupli texture coordinates */
+ /* dupli texture coordinates and random_id */
if(b_dupli_ob) {
object->dupli_generated = 0.5f*get_float3(b_dupli_ob.orco()) - make_float3(0.5f, 0.5f, 0.5f);
object->dupli_uv = get_float2(b_dupli_ob.uv());
+ object->random_id = b_dupli_ob.random_id();
}
else {
object->dupli_generated = make_float3(0.0f, 0.0f, 0.0f);
object->dupli_uv = make_float2(0.0f, 0.0f);
+ object->random_id = hash_int_2d(hash_string(object->name.c_str()), 0);
}
object->tag_update(scene);
@@ -489,7 +478,7 @@ static bool object_render_hide_duplis(BL::Object& b_ob)
/* Object Loop */
-void BlenderSync::sync_objects(BL::SpaceView3D& b_v3d, float motion_time)
+void BlenderSync::sync_objects(float motion_time)
{
/* layer data */
uint scene_layer = render_layer.scene_layer;
@@ -517,7 +506,7 @@ void BlenderSync::sync_objects(BL::SpaceView3D& b_v3d, float motion_time)
* 1 : DAG_EVAL_PREVIEW
* 2 : DAG_EVAL_RENDER
*/
- int dupli_settings = preview ? 1 : 2;
+ int dupli_settings = (render_layer.use_viewport_visibility) ? 1 : 2;
bool cancel = false;
bool use_portal = false;
@@ -552,7 +541,7 @@ void BlenderSync::sync_objects(BL::SpaceView3D& b_v3d, float motion_time)
for(b_ob.dupli_list.begin(b_dup); b_dup != b_ob.dupli_list.end(); ++b_dup) {
Transform tfm = get_transform(b_dup->matrix());
BL::Object b_dup_ob = b_dup->object();
- bool dup_hide = (b_v3d)? b_dup_ob.hide(): b_dup_ob.hide_render();
+ bool dup_hide = (render_layer.use_viewport_visibility)? b_dup_ob.hide(): b_dup_ob.hide_render();
bool in_dupli_group = (b_dup->type() == BL::DupliObject::type_GROUP);
bool hide_tris;
@@ -628,7 +617,6 @@ void BlenderSync::sync_objects(BL::SpaceView3D& b_v3d, float motion_time)
}
void BlenderSync::sync_motion(BL::RenderSettings& b_render,
- BL::SpaceView3D& b_v3d,
BL::Object& b_override,
int width, int height,
void **python_thread_state)
@@ -665,7 +653,7 @@ void BlenderSync::sync_motion(BL::RenderSettings& b_render,
b_engine.frame_set(frame, subframe);
python_thread_state_save(python_thread_state);
sync_camera_motion(b_render, b_cam, width, height, 0.0f);
- sync_objects(b_v3d, 0.0f);
+ sync_objects(0.0f);
}
/* always sample these times for camera motion */
@@ -699,7 +687,7 @@ void BlenderSync::sync_motion(BL::RenderSettings& b_render,
}
/* sync object */
- sync_objects(b_v3d, relative_time);
+ sync_objects(relative_time);
}
/* we need to set the python thread state again because this
diff --git a/intern/cycles/blender/blender_python.cpp b/intern/cycles/blender/blender_python.cpp
index d509e9de981..54973fd1b7f 100644
--- a/intern/cycles/blender/blender_python.cpp
+++ b/intern/cycles/blender/blender_python.cpp
@@ -106,6 +106,7 @@ bool debug_flags_sync_from_scene(BL::Scene b_scene)
}
/* Synchronize other OpenCL flags. */
flags.opencl.debug = get_boolean(cscene, "debug_use_opencl_debug");
+ flags.opencl.mem_limit = ((size_t)get_int(cscene, "debug_opencl_mem_limit"))*1024*1024;
flags.opencl.single_program = get_boolean(cscene, "debug_opencl_kernel_single_program");
return flags.opencl.device_type != opencl_device_type ||
flags.opencl.kernel_type != opencl_kernel_type;
@@ -811,6 +812,14 @@ void *CCL_python_module_init()
PyModule_AddStringConstant(mod, "osl_version_string", "unknown");
#endif
+#ifdef WITH_CYCLES_DEBUG
+ PyModule_AddObject(mod, "with_cycles_debug", Py_True);
+ Py_INCREF(Py_True);
+#else
+ PyModule_AddObject(mod, "with_cycles_debug", Py_False);
+ Py_INCREF(Py_False);
+#endif
+
#ifdef WITH_NETWORK
PyModule_AddObject(mod, "with_network", Py_True);
Py_INCREF(Py_True);
diff --git a/intern/cycles/blender/blender_session.cpp b/intern/cycles/blender/blender_session.cpp
index 26f9bccd95d..12de3da063f 100644
--- a/intern/cycles/blender/blender_session.cpp
+++ b/intern/cycles/blender/blender_session.cpp
@@ -129,9 +129,9 @@ void BlenderSession::create_session()
scene = new Scene(scene_params, session_params.device);
/* setup callbacks for builtin image support */
- scene->image_manager->builtin_image_info_cb = function_bind(&BlenderSession::builtin_image_info, this, _1, _2, _3, _4, _5, _6, _7);
- scene->image_manager->builtin_image_pixels_cb = function_bind(&BlenderSession::builtin_image_pixels, this, _1, _2, _3, _4);
- scene->image_manager->builtin_image_float_pixels_cb = function_bind(&BlenderSession::builtin_image_float_pixels, this, _1, _2, _3, _4);
+ scene->image_manager->builtin_image_info_cb = function_bind(&BlenderSession::builtin_image_info, this, _1, _2, _3, _4, _5, _6, _7, _8);
+ scene->image_manager->builtin_image_pixels_cb = function_bind(&BlenderSession::builtin_image_pixels, this, _1, _2, _3, _4, _5);
+ scene->image_manager->builtin_image_float_pixels_cb = function_bind(&BlenderSession::builtin_image_float_pixels, this, _1, _2, _3, _4, _5);
/* create session */
session = new Session(session_params);
@@ -243,90 +243,6 @@ void BlenderSession::free_session()
delete session;
}
-static PassType get_pass_type(BL::RenderPass& b_pass)
-{
- switch(b_pass.type()) {
- case BL::RenderPass::type_COMBINED:
- return PASS_COMBINED;
-
- case BL::RenderPass::type_Z:
- return PASS_DEPTH;
- case BL::RenderPass::type_MIST:
- return PASS_MIST;
- case BL::RenderPass::type_NORMAL:
- return PASS_NORMAL;
- case BL::RenderPass::type_OBJECT_INDEX:
- return PASS_OBJECT_ID;
- case BL::RenderPass::type_UV:
- return PASS_UV;
- case BL::RenderPass::type_VECTOR:
- return PASS_MOTION;
- case BL::RenderPass::type_MATERIAL_INDEX:
- return PASS_MATERIAL_ID;
-
- case BL::RenderPass::type_DIFFUSE_DIRECT:
- return PASS_DIFFUSE_DIRECT;
- case BL::RenderPass::type_GLOSSY_DIRECT:
- return PASS_GLOSSY_DIRECT;
- case BL::RenderPass::type_TRANSMISSION_DIRECT:
- return PASS_TRANSMISSION_DIRECT;
- case BL::RenderPass::type_SUBSURFACE_DIRECT:
- return PASS_SUBSURFACE_DIRECT;
-
- case BL::RenderPass::type_DIFFUSE_INDIRECT:
- return PASS_DIFFUSE_INDIRECT;
- case BL::RenderPass::type_GLOSSY_INDIRECT:
- return PASS_GLOSSY_INDIRECT;
- case BL::RenderPass::type_TRANSMISSION_INDIRECT:
- return PASS_TRANSMISSION_INDIRECT;
- case BL::RenderPass::type_SUBSURFACE_INDIRECT:
- return PASS_SUBSURFACE_INDIRECT;
-
- case BL::RenderPass::type_DIFFUSE_COLOR:
- return PASS_DIFFUSE_COLOR;
- case BL::RenderPass::type_GLOSSY_COLOR:
- return PASS_GLOSSY_COLOR;
- case BL::RenderPass::type_TRANSMISSION_COLOR:
- return PASS_TRANSMISSION_COLOR;
- case BL::RenderPass::type_SUBSURFACE_COLOR:
- return PASS_SUBSURFACE_COLOR;
-
- case BL::RenderPass::type_EMIT:
- return PASS_EMISSION;
- case BL::RenderPass::type_ENVIRONMENT:
- return PASS_BACKGROUND;
- case BL::RenderPass::type_AO:
- return PASS_AO;
- case BL::RenderPass::type_SHADOW:
- return PASS_SHADOW;
-
- case BL::RenderPass::type_DIFFUSE:
- case BL::RenderPass::type_COLOR:
- case BL::RenderPass::type_REFRACTION:
- case BL::RenderPass::type_SPECULAR:
- case BL::RenderPass::type_REFLECTION:
- return PASS_NONE;
-#ifdef WITH_CYCLES_DEBUG
- case BL::RenderPass::type_DEBUG:
- {
- switch(b_pass.debug_type()) {
- case BL::RenderPass::debug_type_BVH_TRAVERSED_NODES:
- return PASS_BVH_TRAVERSED_NODES;
- case BL::RenderPass::debug_type_BVH_TRAVERSED_INSTANCES:
- return PASS_BVH_TRAVERSED_INSTANCES;
- case BL::RenderPass::debug_type_BVH_INTERSECTIONS:
- return PASS_BVH_INTERSECTIONS;
- case BL::RenderPass::debug_type_RAY_BOUNCES:
- return PASS_RAY_BOUNCES;
- }
- break;
- }
-#endif
- }
-
- return PASS_NONE;
-}
-
static ShaderEvalType get_shader_type(const string& pass_type)
{
const char *shader_type = pass_type.c_str();
@@ -383,12 +299,13 @@ static BL::RenderResult begin_render_result(BL::RenderEngine& b_engine,
static void end_render_result(BL::RenderEngine& b_engine,
BL::RenderResult& b_rr,
bool cancel,
+ bool highlight,
bool do_merge_results)
{
- b_engine.end_result(b_rr, (int)cancel, (int)do_merge_results);
+ b_engine.end_result(b_rr, (int)cancel, (int) highlight, (int)do_merge_results);
}
-void BlenderSession::do_write_update_render_tile(RenderTile& rtile, bool do_update_only)
+void BlenderSession::do_write_update_render_tile(RenderTile& rtile, bool do_update_only, bool highlight)
{
BufferParams& params = rtile.buffers->params;
int x = params.full_x - session->tile_manager.params.full_x;
@@ -424,37 +341,37 @@ void BlenderSession::do_write_update_render_tile(RenderTile& rtile, bool do_upda
update_render_result(b_rr, b_rlay, rtile);
}
- end_render_result(b_engine, b_rr, true, true);
+ end_render_result(b_engine, b_rr, true, highlight, true);
}
else {
/* write result */
write_render_result(b_rr, b_rlay, rtile);
- end_render_result(b_engine, b_rr, false, true);
+ end_render_result(b_engine, b_rr, false, false, true);
}
}
void BlenderSession::write_render_tile(RenderTile& rtile)
{
- do_write_update_render_tile(rtile, false);
+ do_write_update_render_tile(rtile, false, false);
}
-void BlenderSession::update_render_tile(RenderTile& rtile)
+void BlenderSession::update_render_tile(RenderTile& rtile, bool highlight)
{
/* use final write for preview renders, otherwise render result wouldn't be
* be updated in blender side
* would need to be investigated a bit further, but for now shall be fine
*/
if(!b_engine.is_preview())
- do_write_update_render_tile(rtile, true);
+ do_write_update_render_tile(rtile, true, highlight);
else
- do_write_update_render_tile(rtile, false);
+ do_write_update_render_tile(rtile, false, false);
}
void BlenderSession::render()
{
/* set callback to write out render results */
session->write_render_tile_cb = function_bind(&BlenderSession::write_render_tile, this, _1);
- session->update_render_tile_cb = function_bind(&BlenderSession::update_render_tile, this, _1);
+ session->update_render_tile_cb = function_bind(&BlenderSession::update_render_tile, this, _1, _2);
/* get buffer parameters */
SessionParams session_params = BlenderSync::get_session_params(b_engine, b_userpref, b_scene, background);
@@ -475,33 +392,38 @@ void BlenderSession::render()
/* layer will be missing if it was disabled in the UI */
if(b_single_rlay == b_rr.layers.end()) {
- end_render_result(b_engine, b_rr, true, false);
+ end_render_result(b_engine, b_rr, true, true, false);
continue;
}
BL::RenderLayer b_rlay = *b_single_rlay;
/* add passes */
- array<Pass> passes;
- Pass::add(PASS_COMBINED, passes);
-
- if(session_params.device.advanced_shading) {
-
- /* loop over passes */
- BL::RenderLayer::passes_iterator b_pass_iter;
-
- for(b_rlay.passes.begin(b_pass_iter); b_pass_iter != b_rlay.passes.end(); ++b_pass_iter) {
- BL::RenderPass b_pass(*b_pass_iter);
- PassType pass_type = get_pass_type(b_pass);
+ array<Pass> passes = sync->sync_render_passes(b_rlay, *b_layer_iter, session_params);
+ buffer_params.passes = passes;
- if(pass_type == PASS_MOTION && scene->integrator->motion_blur)
- continue;
- if(pass_type != PASS_NONE)
- Pass::add(pass_type, passes);
- }
- }
+ PointerRNA crl = RNA_pointer_get(&b_layer_iter->ptr, "cycles");
+ bool use_denoising = !session_params.progressive_refine && get_boolean(crl, "use_denoising");
+ buffer_params.denoising_data_pass = use_denoising;
+ session->tile_manager.schedule_denoising = use_denoising;
+ session->params.use_denoising = use_denoising;
+ scene->film->denoising_data_pass = buffer_params.denoising_data_pass;
+ scene->film->denoising_flags = 0;
+ if(!get_boolean(crl, "denoising_diffuse_direct")) scene->film->denoising_flags |= DENOISING_CLEAN_DIFFUSE_DIR;
+ if(!get_boolean(crl, "denoising_diffuse_indirect")) scene->film->denoising_flags |= DENOISING_CLEAN_DIFFUSE_IND;
+ if(!get_boolean(crl, "denoising_glossy_direct")) scene->film->denoising_flags |= DENOISING_CLEAN_GLOSSY_DIR;
+ if(!get_boolean(crl, "denoising_glossy_indirect")) scene->film->denoising_flags |= DENOISING_CLEAN_GLOSSY_IND;
+ if(!get_boolean(crl, "denoising_transmission_direct")) scene->film->denoising_flags |= DENOISING_CLEAN_TRANSMISSION_DIR;
+ if(!get_boolean(crl, "denoising_transmission_indirect")) scene->film->denoising_flags |= DENOISING_CLEAN_TRANSMISSION_IND;
+ if(!get_boolean(crl, "denoising_subsurface_direct")) scene->film->denoising_flags |= DENOISING_CLEAN_SUBSURFACE_DIR;
+ if(!get_boolean(crl, "denoising_subsurface_indirect")) scene->film->denoising_flags |= DENOISING_CLEAN_SUBSURFACE_IND;
+ scene->film->denoising_clean_pass = (scene->film->denoising_flags & DENOISING_CLEAN_ALL_PASSES);
+ buffer_params.denoising_clean_pass = scene->film->denoising_clean_pass;
+ session->params.denoising_radius = get_int(crl, "denoising_radius");
+ session->params.denoising_strength = get_float(crl, "denoising_strength");
+ session->params.denoising_feature_strength = get_float(crl, "denoising_feature_strength");
+ session->params.denoising_relative_pca = get_boolean(crl, "denoising_relative_pca");
- buffer_params.passes = passes;
scene->film->pass_alpha_threshold = b_layer_iter->pass_alpha_threshold();
scene->film->tag_passes_update(scene, passes);
scene->film->tag_update(scene);
@@ -555,7 +477,7 @@ void BlenderSession::render()
}
/* free result without merging */
- end_render_result(b_engine, b_rr, true, false);
+ end_render_result(b_engine, b_rr, true, true, false);
if(session->progress.get_cancel())
break;
@@ -636,8 +558,6 @@ void BlenderSession::bake(BL::Object& b_object,
float result[])
{
ShaderEvalType shader_type = get_shader_type(pass_type);
- size_t object_index = OBJECT_NONE;
- int tri_offset = 0;
/* Set baking flag in advance, so kernel loading can check if we need
* any baking capabilities.
@@ -647,9 +567,6 @@ void BlenderSession::bake(BL::Object& b_object,
/* ensure kernels are loaded before we do any scene updates */
session->load_kernels();
- if(session->progress.get_cancel())
- return;
-
if(shader_type == SHADER_EVAL_UV) {
/* force UV to be available */
Pass::add(PASS_UV, scene->film->passes);
@@ -667,50 +584,61 @@ void BlenderSession::bake(BL::Object& b_object,
scene->film->tag_update(scene);
scene->integrator->tag_update(scene);
- /* update scene */
- BL::Object b_camera_override(b_engine.camera_override());
- sync->sync_camera(b_render, b_camera_override, width, height, "");
- sync->sync_data(b_render,
- b_v3d,
- b_camera_override,
- width, height,
- &python_thread_state,
- b_rlay_name.c_str());
+ if(!session->progress.get_cancel()) {
+ /* update scene */
+ BL::Object b_camera_override(b_engine.camera_override());
+ sync->sync_camera(b_render, b_camera_override, width, height, "");
+ sync->sync_data(b_render,
+ b_v3d,
+ b_camera_override,
+ width, height,
+ &python_thread_state,
+ b_rlay_name.c_str());
+ }
- /* get buffer parameters */
- SessionParams session_params = BlenderSync::get_session_params(b_engine, b_userpref, b_scene, background);
- BufferParams buffer_params = BlenderSync::get_buffer_params(b_render, b_v3d, b_rv3d, scene->camera, width, height);
+ BakeData *bake_data = NULL;
+
+ if(!session->progress.get_cancel()) {
+ /* get buffer parameters */
+ SessionParams session_params = BlenderSync::get_session_params(b_engine, b_userpref, b_scene, background);
+ BufferParams buffer_params = BlenderSync::get_buffer_params(b_render, b_v3d, b_rv3d, scene->camera, width, height);
- scene->bake_manager->set_shader_limit((size_t)b_engine.tile_x(), (size_t)b_engine.tile_y());
+ scene->bake_manager->set_shader_limit((size_t)b_engine.tile_x(), (size_t)b_engine.tile_y());
- /* set number of samples */
- session->tile_manager.set_samples(session_params.samples);
- session->reset(buffer_params, session_params.samples);
- session->update_scene();
+ /* set number of samples */
+ session->tile_manager.set_samples(session_params.samples);
+ session->reset(buffer_params, session_params.samples);
+ session->update_scene();
- /* find object index. todo: is arbitrary - copied from mesh_displace.cpp */
- for(size_t i = 0; i < scene->objects.size(); i++) {
- if(strcmp(scene->objects[i]->name.c_str(), b_object.name().c_str()) == 0) {
- object_index = i;
- tri_offset = scene->objects[i]->mesh->tri_offset;
- break;
- }
- }
+ /* find object index. todo: is arbitrary - copied from mesh_displace.cpp */
+ size_t object_index = OBJECT_NONE;
+ int tri_offset = 0;
- int object = object_index;
+ for(size_t i = 0; i < scene->objects.size(); i++) {
+ if(strcmp(scene->objects[i]->name.c_str(), b_object.name().c_str()) == 0) {
+ object_index = i;
+ tri_offset = scene->objects[i]->mesh->tri_offset;
+ break;
+ }
+ }
- BakeData *bake_data = scene->bake_manager->init(object, tri_offset, num_pixels);
+ int object = object_index;
- populate_bake_data(bake_data, object_id, pixel_array, num_pixels);
+ bake_data = scene->bake_manager->init(object, tri_offset, num_pixels);
+ populate_bake_data(bake_data, object_id, pixel_array, num_pixels);
- /* set number of samples */
- session->tile_manager.set_samples(session_params.samples);
- session->reset(buffer_params, session_params.samples);
- session->update_scene();
+ /* set number of samples */
+ session->tile_manager.set_samples(session_params.samples);
+ session->reset(buffer_params, session_params.samples);
+ session->update_scene();
- session->progress.set_update_callback(function_bind(&BlenderSession::update_bake_progress, this));
+ session->progress.set_update_callback(function_bind(&BlenderSession::update_bake_progress, this));
+ }
- scene->bake_manager->bake(scene->device, &scene->dscene, scene, session->progress, shader_type, bake_pass_filter, bake_data, result);
+ /* Perform bake. Check cancel to avoid crash with incomplete scene data. */
+ if(!session->progress.get_cancel()) {
+ scene->bake_manager->bake(scene->device, &scene->dscene, scene, session->progress, shader_type, bake_pass_filter, bake_data, result);
+ }
/* free all memory used (host and device), so we wouldn't leave render
* engine with extra memory allocated
@@ -753,19 +681,31 @@ void BlenderSession::do_write_update_render_result(BL::RenderResult& b_rr,
BL::RenderPass b_pass(*b_iter);
/* find matching pass type */
- PassType pass_type = get_pass_type(b_pass);
+ PassType pass_type = BlenderSync::get_pass_type(b_pass);
int components = b_pass.channels();
- /* copy pixels */
- if(!buffers->get_pass_rect(pass_type, exposure, sample, components, &pixels[0]))
+ bool read = false;
+ if(pass_type != PASS_NONE) {
+ /* copy pixels */
+ read = buffers->get_pass_rect(pass_type, exposure, sample, components, &pixels[0]);
+ }
+ else {
+ int denoising_offset = BlenderSync::get_denoising_pass(b_pass);
+ if(denoising_offset >= 0) {
+ read = buffers->get_denoising_pass_rect(denoising_offset, exposure, sample, components, &pixels[0]);
+ }
+ }
+
+ if(!read) {
memset(&pixels[0], 0, pixels.size()*sizeof(float));
+ }
b_pass.rect(&pixels[0]);
}
}
else {
/* copy combined pass */
- BL::RenderPass b_combined_pass(b_rlay.passes.find_by_type(BL::RenderPass::type_COMBINED, b_rview_name.c_str()));
+ BL::RenderPass b_combined_pass(b_rlay.passes.find_by_name("Combined", b_rview_name.c_str()));
if(buffers->get_pass_rect(PASS_COMBINED, exposure, sample, 4, &pixels[0]))
b_combined_pass.rect(&pixels[0]);
}
@@ -1073,7 +1013,8 @@ void BlenderSession::builtin_image_info(const string &builtin_name,
int &width,
int &height,
int &depth,
- int &channels)
+ int &channels,
+ bool& free_cache)
{
/* empty image */
is_float = false;
@@ -1081,6 +1022,7 @@ void BlenderSession::builtin_image_info(const string &builtin_name,
height = 1;
depth = 0;
channels = 0;
+ free_cache = false;
if(!builtin_data)
return;
@@ -1094,6 +1036,7 @@ void BlenderSession::builtin_image_info(const string &builtin_name,
/* image data */
BL::Image b_image(b_id);
+ free_cache = !b_image.has_data();
is_float = b_image.is_float();
width = b_image.size()[0];
height = b_image.size()[1];
@@ -1154,7 +1097,8 @@ void BlenderSession::builtin_image_info(const string &builtin_name,
bool BlenderSession::builtin_image_pixels(const string &builtin_name,
void *builtin_data,
unsigned char *pixels,
- const size_t pixels_size)
+ const size_t pixels_size,
+ const bool free_cache)
{
if(!builtin_data) {
return false;
@@ -1175,7 +1119,6 @@ bool BlenderSession::builtin_image_pixels(const string &builtin_name,
if(image_pixels && num_pixels * channels == pixels_size) {
memcpy(pixels, image_pixels, pixels_size * sizeof(unsigned char));
- MEM_freeN(image_pixels);
}
else {
if(channels == 1) {
@@ -1194,6 +1137,16 @@ bool BlenderSession::builtin_image_pixels(const string &builtin_name,
}
}
}
+
+ if(image_pixels) {
+ MEM_freeN(image_pixels);
+ }
+
+ /* Free image buffers to save memory during render. */
+ if(free_cache) {
+ b_image.buffers_free();
+ }
+
/* Premultiply, byte images are always straight for Blender. */
unsigned char *cp = pixels;
for(size_t i = 0; i < num_pixels; i++, cp += channels) {
@@ -1207,7 +1160,8 @@ bool BlenderSession::builtin_image_pixels(const string &builtin_name,
bool BlenderSession::builtin_image_float_pixels(const string &builtin_name,
void *builtin_data,
float *pixels,
- const size_t pixels_size)
+ const size_t pixels_size,
+ const bool free_cache)
{
if(!builtin_data) {
return false;
@@ -1232,7 +1186,6 @@ bool BlenderSession::builtin_image_float_pixels(const string &builtin_name,
if(image_pixels && num_pixels * channels == pixels_size) {
memcpy(pixels, image_pixels, pixels_size * sizeof(float));
- MEM_freeN(image_pixels);
}
else {
if(channels == 1) {
@@ -1252,6 +1205,15 @@ bool BlenderSession::builtin_image_float_pixels(const string &builtin_name,
}
}
+ if(image_pixels) {
+ MEM_freeN(image_pixels);
+ }
+
+ /* Free image buffers to save memory during render. */
+ if(free_cache) {
+ b_image.buffers_free();
+ }
+
return true;
}
else if(b_id.is_a(&RNA_Object)) {
diff --git a/intern/cycles/blender/blender_session.h b/intern/cycles/blender/blender_session.h
index 22b21a18f2e..cbd2303d282 100644
--- a/intern/cycles/blender/blender_session.h
+++ b/intern/cycles/blender/blender_session.h
@@ -79,7 +79,7 @@ public:
void update_render_result(BL::RenderResult& b_rr,
BL::RenderLayer& b_rlay,
RenderTile& rtile);
- void update_render_tile(RenderTile& rtile);
+ void update_render_tile(RenderTile& rtile, bool highlight);
/* interactive updates */
void synchronize();
@@ -147,7 +147,7 @@ protected:
BL::RenderLayer& b_rlay,
RenderTile& rtile,
bool do_update_only);
- void do_write_update_render_tile(RenderTile& rtile, bool do_update_only);
+ void do_write_update_render_tile(RenderTile& rtile, bool do_update_only, bool highlight);
int builtin_image_frame(const string &builtin_name);
void builtin_image_info(const string &builtin_name,
@@ -156,15 +156,18 @@ protected:
int &width,
int &height,
int &depth,
- int &channels);
+ int &channels,
+ bool &free_cache);
bool builtin_image_pixels(const string &builtin_name,
void *builtin_data,
unsigned char *pixels,
- const size_t pixels_size);
+ const size_t pixels_size,
+ const bool free_cache);
bool builtin_image_float_pixels(const string &builtin_name,
void *builtin_data,
float *pixels,
- const size_t pixels_size);
+ const size_t pixels_size,
+ const bool free_cache);
/* Update tile manager to reflect resumable render settings. */
void update_resumable_tile_manager(int num_samples);
diff --git a/intern/cycles/blender/blender_shader.cpp b/intern/cycles/blender/blender_shader.cpp
index 3f04f11aab4..bdbab1006c0 100644
--- a/intern/cycles/blender/blender_shader.cpp
+++ b/intern/cycles/blender/blender_shader.cpp
@@ -521,6 +521,19 @@ static ShaderNode *add_node(Scene *scene,
}
node = hair;
}
+ else if(b_node.is_a(&RNA_ShaderNodeBsdfPrincipled)) {
+ BL::ShaderNodeBsdfPrincipled b_principled_node(b_node);
+ PrincipledBsdfNode *principled = new PrincipledBsdfNode();
+ switch (b_principled_node.distribution()) {
+ case BL::ShaderNodeBsdfPrincipled::distribution_GGX:
+ principled->distribution = CLOSURE_BSDF_MICROFACET_GGX_GLASS_ID;
+ break;
+ case BL::ShaderNodeBsdfPrincipled::distribution_MULTI_GGX:
+ principled->distribution = CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID;
+ break;
+ }
+ node = principled;
+ }
else if(b_node.is_a(&RNA_ShaderNodeBsdfTranslucent)) {
node = new TranslucentBsdfNode();
}
diff --git a/intern/cycles/blender/blender_sync.cpp b/intern/cycles/blender/blender_sync.cpp
index 3b071bf0e7d..3a00384458a 100644
--- a/intern/cycles/blender/blender_sync.cpp
+++ b/intern/cycles/blender/blender_sync.cpp
@@ -210,10 +210,9 @@ void BlenderSync::sync_data(BL::RenderSettings& b_render,
scene->need_motion() == Scene::MOTION_NONE ||
scene->camera->motion_position == Camera::MOTION_POSITION_CENTER)
{
- sync_objects(b_v3d);
+ sync_objects();
}
sync_motion(b_render,
- b_v3d,
b_override,
width, height,
python_thread_state);
@@ -330,6 +329,9 @@ void BlenderSync::sync_integrator()
integrator->ao_bounces = get_int(cscene, "ao_bounces_render");
}
}
+ else {
+ integrator->ao_bounces = 0;
+ }
if(integrator->modified(previntegrator))
integrator->tag_update(scene);
@@ -480,6 +482,137 @@ void BlenderSync::sync_images()
}
}
+/* Passes */
+PassType BlenderSync::get_pass_type(BL::RenderPass& b_pass)
+{
+ string name = b_pass.name();
+#define MAP_PASS(passname, passtype) if(name == passname) return passtype;
+ /* NOTE: Keep in sync with defined names from DNA_scene_types.h */
+ MAP_PASS("Combined", PASS_COMBINED);
+ MAP_PASS("Depth", PASS_DEPTH);
+ MAP_PASS("Mist", PASS_MIST);
+ MAP_PASS("Normal", PASS_NORMAL);
+ MAP_PASS("IndexOB", PASS_OBJECT_ID);
+ MAP_PASS("UV", PASS_UV);
+ MAP_PASS("Vector", PASS_MOTION);
+ MAP_PASS("IndexMA", PASS_MATERIAL_ID);
+
+ MAP_PASS("DiffDir", PASS_DIFFUSE_DIRECT);
+ MAP_PASS("GlossDir", PASS_GLOSSY_DIRECT);
+ MAP_PASS("TransDir", PASS_TRANSMISSION_DIRECT);
+ MAP_PASS("SubsurfaceDir", PASS_SUBSURFACE_DIRECT);
+
+ MAP_PASS("DiffInd", PASS_DIFFUSE_INDIRECT);
+ MAP_PASS("GlossInd", PASS_GLOSSY_INDIRECT);
+ MAP_PASS("TransInd", PASS_TRANSMISSION_INDIRECT);
+ MAP_PASS("SubsurfaceInd", PASS_SUBSURFACE_INDIRECT);
+
+ MAP_PASS("DiffCol", PASS_DIFFUSE_COLOR);
+ MAP_PASS("GlossCol", PASS_GLOSSY_COLOR);
+ MAP_PASS("TransCol", PASS_TRANSMISSION_COLOR);
+ MAP_PASS("SubsurfaceCol", PASS_SUBSURFACE_COLOR);
+
+ MAP_PASS("Emit", PASS_EMISSION);
+ MAP_PASS("Env", PASS_BACKGROUND);
+ MAP_PASS("AO", PASS_AO);
+ MAP_PASS("Shadow", PASS_SHADOW);
+
+#ifdef __KERNEL_DEBUG__
+ MAP_PASS("Debug BVH Traversed Nodes", PASS_BVH_TRAVERSED_NODES);
+ MAP_PASS("Debug BVH Traversed Instances", PASS_BVH_TRAVERSED_INSTANCES);
+ MAP_PASS("Debug BVH Intersections", PASS_BVH_INTERSECTIONS);
+ MAP_PASS("Debug Ray Bounces", PASS_RAY_BOUNCES);
+#endif
+#undef MAP_PASS
+
+ return PASS_NONE;
+}
+
+int BlenderSync::get_denoising_pass(BL::RenderPass& b_pass)
+{
+ string name = b_pass.name();
+ if(name.substr(0, 10) != "Denoising ") {
+ return -1;
+ }
+ name = name.substr(10);
+
+#define MAP_PASS(passname, offset) if(name == passname) return offset;
+ MAP_PASS("Normal", DENOISING_PASS_NORMAL);
+ MAP_PASS("Normal Variance", DENOISING_PASS_NORMAL_VAR);
+ MAP_PASS("Albedo", DENOISING_PASS_ALBEDO);
+ MAP_PASS("Albedo Variance", DENOISING_PASS_ALBEDO_VAR);
+ MAP_PASS("Depth", DENOISING_PASS_DEPTH);
+ MAP_PASS("Depth Variance", DENOISING_PASS_DEPTH_VAR);
+ MAP_PASS("Shadow A", DENOISING_PASS_SHADOW_A);
+ MAP_PASS("Shadow B", DENOISING_PASS_SHADOW_B);
+ MAP_PASS("Image", DENOISING_PASS_COLOR);
+ MAP_PASS("Image Variance", DENOISING_PASS_COLOR_VAR);
+#undef MAP_PASS
+
+ return -1;
+}
+
+array<Pass> BlenderSync::sync_render_passes(BL::RenderLayer& b_rlay,
+ BL::SceneRenderLayer& b_srlay,
+ const SessionParams &session_params)
+{
+ array<Pass> passes;
+ Pass::add(PASS_COMBINED, passes);
+
+ if(!session_params.device.advanced_shading) {
+ return passes;
+ }
+
+ /* loop over passes */
+ BL::RenderLayer::passes_iterator b_pass_iter;
+
+ for(b_rlay.passes.begin(b_pass_iter); b_pass_iter != b_rlay.passes.end(); ++b_pass_iter) {
+ BL::RenderPass b_pass(*b_pass_iter);
+ PassType pass_type = get_pass_type(b_pass);
+
+ if(pass_type == PASS_MOTION && scene->integrator->motion_blur)
+ continue;
+ if(pass_type != PASS_NONE)
+ Pass::add(pass_type, passes);
+ }
+
+ PointerRNA crp = RNA_pointer_get(&b_srlay.ptr, "cycles");
+ if(get_boolean(crp, "denoising_store_passes") &&
+ get_boolean(crp, "use_denoising") &&
+ !session_params.progressive_refine) {
+ b_engine.add_pass("Denoising Normal", 3, "XYZ", b_srlay.name().c_str());
+ b_engine.add_pass("Denoising Normal Variance", 3, "XYZ", b_srlay.name().c_str());
+ b_engine.add_pass("Denoising Albedo", 3, "RGB", b_srlay.name().c_str());
+ b_engine.add_pass("Denoising Albedo Variance", 3, "RGB", b_srlay.name().c_str());
+ b_engine.add_pass("Denoising Depth", 1, "Z", b_srlay.name().c_str());
+ b_engine.add_pass("Denoising Depth Variance", 1, "Z", b_srlay.name().c_str());
+ b_engine.add_pass("Denoising Shadow A", 3, "XYV", b_srlay.name().c_str());
+ b_engine.add_pass("Denoising Shadow B", 3, "XYV", b_srlay.name().c_str());
+ b_engine.add_pass("Denoising Image", 3, "RGB", b_srlay.name().c_str());
+ b_engine.add_pass("Denoising Image Variance", 3, "RGB", b_srlay.name().c_str());
+ }
+#ifdef __KERNEL_DEBUG__
+ if(get_boolean(crp, "pass_debug_bvh_traversed_nodes")) {
+ b_engine.add_pass("Debug BVH Traversed Nodes", 1, "X", b_srlay.name().c_str());
+ Pass::add(PASS_BVH_TRAVERSED_NODES, passes);
+ }
+ if(get_boolean(crp, "pass_debug_bvh_traversed_instances")) {
+ b_engine.add_pass("Debug BVH Traversed Instances", 1, "X", b_srlay.name().c_str());
+ Pass::add(PASS_BVH_TRAVERSED_INSTANCES, passes);
+ }
+ if(get_boolean(crp, "pass_debug_bvh_intersections")) {
+ b_engine.add_pass("Debug BVH Intersections", 1, "X", b_srlay.name().c_str());
+ Pass::add(PASS_BVH_INTERSECTIONS, passes);
+ }
+ if(get_boolean(crp, "pass_debug_ray_bounces")) {
+ b_engine.add_pass("Debug Ray Bounces", 1, "X", b_srlay.name().c_str());
+ Pass::add(PASS_RAY_BOUNCES, passes);
+ }
+#endif
+
+ return passes;
+}
+
/* Scene Parameters */
SceneParams BlenderSync::get_scene_params(BL::Scene& b_scene,
diff --git a/intern/cycles/blender/blender_sync.h b/intern/cycles/blender/blender_sync.h
index 36bedc505af..4ec46424b5a 100644
--- a/intern/cycles/blender/blender_sync.h
+++ b/intern/cycles/blender/blender_sync.h
@@ -67,6 +67,9 @@ public:
void **python_thread_state,
const char *layer = 0);
void sync_render_layers(BL::SpaceView3D& b_v3d, const char *layer);
+ array<Pass> sync_render_passes(BL::RenderLayer& b_rlay,
+ BL::SceneRenderLayer& b_srlay,
+ const SessionParams &session_params);
void sync_integrator();
void sync_camera(BL::RenderSettings& b_render,
BL::Object& b_override,
@@ -93,13 +96,15 @@ public:
Camera *cam,
int width, int height);
+ static PassType get_pass_type(BL::RenderPass& b_pass);
+ static int get_denoising_pass(BL::RenderPass& b_pass);
+
private:
/* sync */
void sync_lamps(bool update_all);
void sync_materials(bool update_all);
- void sync_objects(BL::SpaceView3D& b_v3d, float motion_time = 0.0f);
+ void sync_objects(float motion_time = 0.0f);
void sync_motion(BL::RenderSettings& b_render,
- BL::SpaceView3D& b_v3d,
BL::Object& b_override,
int width, int height,
void **python_thread_state);
diff --git a/intern/cycles/blender/blender_util.h b/intern/cycles/blender/blender_util.h
index abdbb6be0fd..363e19f7a20 100644
--- a/intern/cycles/blender/blender_util.h
+++ b/intern/cycles/blender/blender_util.h
@@ -51,8 +51,8 @@ static inline BL::Mesh object_to_mesh(BL::BlendData& data,
bool calc_undeformed,
Mesh::SubdivisionType subdivision_type)
{
- bool subsurf_mod_show_render;
- bool subsurf_mod_show_viewport;
+ bool subsurf_mod_show_render = false;
+ bool subsurf_mod_show_viewport = false;
if(subdivision_type != Mesh::SUBDIVISION_NONE) {
BL::Modifier subsurf_mod = object.modifiers[object.modifiers.length()-1];
@@ -299,7 +299,7 @@ static inline uint get_layer(const BL::Array<int, 20>& array)
for(uint i = 0; i < 20; i++)
if(array[i])
layer |= (1 << i);
-
+
return layer;
}
@@ -434,7 +434,7 @@ static inline string get_string(PointerRNA& ptr, const char *name)
string str(cstr);
if(cstr != cstrbuf)
MEM_freeN(cstr);
-
+
return str;
}
@@ -451,7 +451,7 @@ static inline string blender_absolute_path(BL::BlendData& b_data,
{
if(path.size() >= 2 && path[0] == '/' && path[1] == '/') {
string dirname;
-
+
if(b_id.library()) {
BL::ID b_library_id(b_id.library());
dirname = blender_absolute_path(b_data,
@@ -544,7 +544,7 @@ static inline BL::SmokeDomainSettings object_smoke_domain_find(BL::Object& b_ob)
return b_smd.domain_settings();
}
}
-
+
return BL::SmokeDomainSettings(PointerRNA_NULL);
}
@@ -816,4 +816,3 @@ protected:
CCL_NAMESPACE_END
#endif /* __BLENDER_UTIL_H__ */
-
diff --git a/intern/cycles/bvh/CMakeLists.txt b/intern/cycles/bvh/CMakeLists.txt
index 4701d75350a..6078db5a8ca 100644
--- a/intern/cycles/bvh/CMakeLists.txt
+++ b/intern/cycles/bvh/CMakeLists.txt
@@ -8,6 +8,8 @@ set(INC_SYS
set(SRC
bvh.cpp
+ bvh2.cpp
+ bvh4.cpp
bvh_binning.cpp
bvh_build.cpp
bvh_node.cpp
@@ -18,6 +20,8 @@ set(SRC
set(SRC_HEADERS
bvh.h
+ bvh2.h
+ bvh4.h
bvh_binning.h
bvh_build.h
bvh_node.h
diff --git a/intern/cycles/bvh/bvh.cpp b/intern/cycles/bvh/bvh.cpp
index 58348d16746..33143e2d8aa 100644
--- a/intern/cycles/bvh/bvh.cpp
+++ b/intern/cycles/bvh/bvh.cpp
@@ -15,45 +15,32 @@
* limitations under the License.
*/
+#include "bvh/bvh.h"
+
#include "render/mesh.h"
#include "render/object.h"
-#include "render/scene.h"
-#include "render/curves.h"
-#include "bvh/bvh.h"
+#include "bvh/bvh2.h"
+#include "bvh/bvh4.h"
#include "bvh/bvh_build.h"
#include "bvh/bvh_node.h"
-#include "bvh/bvh_params.h"
-#include "bvh/bvh_unaligned.h"
-#include "util/util_debug.h"
#include "util/util_foreach.h"
-#include "util/util_logging.h"
-#include "util/util_map.h"
#include "util/util_progress.h"
-#include "util/util_system.h"
-#include "util/util_types.h"
-#include "util/util_math.h"
CCL_NAMESPACE_BEGIN
/* Pack Utility */
-struct BVHStackEntry
+BVHStackEntry::BVHStackEntry(const BVHNode *n, int i)
+ : node(n), idx(i)
{
- const BVHNode *node;
- int idx;
-
- BVHStackEntry(const BVHNode* n = 0, int i = 0)
- : node(n), idx(i)
- {
- }
+}
- int encodeIdx() const
- {
- return (node->is_leaf())? ~idx: idx;
- }
-};
+int BVHStackEntry::encodeIdx() const
+{
+ return (node->is_leaf())? ~idx: idx;
+}
/* BVH */
@@ -65,9 +52,9 @@ BVH::BVH(const BVHParams& params_, const vector<Object*>& objects_)
BVH *BVH::create(const BVHParams& params, const vector<Object*>& objects)
{
if(params.use_qbvh)
- return new QBVH(params, objects);
+ return new BVH4(params, objects);
else
- return new BinaryBVH(params, objects);
+ return new BVH2(params, objects);
}
/* Building */
@@ -418,832 +405,4 @@ void BVH::pack_instances(size_t nodes_size, size_t leaf_nodes_size)
}
}
-/* Regular BVH */
-
-static bool node_bvh_is_unaligned(const BVHNode *node)
-{
- const BVHNode *node0 = node->get_child(0),
- *node1 = node->get_child(1);
- return node0->is_unaligned || node1->is_unaligned;
-}
-
-BinaryBVH::BinaryBVH(const BVHParams& params_, const vector<Object*>& objects_)
-: BVH(params_, objects_)
-{
-}
-
-void BinaryBVH::pack_leaf(const BVHStackEntry& e,
- const LeafNode *leaf)
-{
- assert(e.idx + BVH_NODE_LEAF_SIZE <= pack.leaf_nodes.size());
- float4 data[BVH_NODE_LEAF_SIZE];
- memset(data, 0, sizeof(data));
- if(leaf->num_triangles() == 1 && pack.prim_index[leaf->lo] == -1) {
- /* object */
- data[0].x = __int_as_float(~(leaf->lo));
- data[0].y = __int_as_float(0);
- }
- else {
- /* triangle */
- data[0].x = __int_as_float(leaf->lo);
- data[0].y = __int_as_float(leaf->hi);
- }
- data[0].z = __uint_as_float(leaf->visibility);
- if(leaf->num_triangles() != 0) {
- data[0].w = __uint_as_float(pack.prim_type[leaf->lo]);
- }
-
- memcpy(&pack.leaf_nodes[e.idx], data, sizeof(float4)*BVH_NODE_LEAF_SIZE);
-}
-
-void BinaryBVH::pack_inner(const BVHStackEntry& e,
- const BVHStackEntry& e0,
- const BVHStackEntry& e1)
-{
- if(e0.node->is_unaligned || e1.node->is_unaligned) {
- pack_unaligned_inner(e, e0, e1);
- } else {
- pack_aligned_inner(e, e0, e1);
- }
-}
-
-void BinaryBVH::pack_aligned_inner(const BVHStackEntry& e,
- const BVHStackEntry& e0,
- const BVHStackEntry& e1)
-{
- pack_aligned_node(e.idx,
- e0.node->bounds, e1.node->bounds,
- e0.encodeIdx(), e1.encodeIdx(),
- e0.node->visibility, e1.node->visibility);
-}
-
-void BinaryBVH::pack_aligned_node(int idx,
- const BoundBox& b0,
- const BoundBox& b1,
- int c0, int c1,
- uint visibility0, uint visibility1)
-{
- assert(idx + BVH_NODE_SIZE <= pack.nodes.size());
- assert(c0 < 0 || c0 < pack.nodes.size());
- assert(c1 < 0 || c1 < pack.nodes.size());
-
- int4 data[BVH_NODE_SIZE] = {
- make_int4(visibility0 & ~PATH_RAY_NODE_UNALIGNED,
- visibility1 & ~PATH_RAY_NODE_UNALIGNED,
- c0, c1),
- make_int4(__float_as_int(b0.min.x),
- __float_as_int(b1.min.x),
- __float_as_int(b0.max.x),
- __float_as_int(b1.max.x)),
- make_int4(__float_as_int(b0.min.y),
- __float_as_int(b1.min.y),
- __float_as_int(b0.max.y),
- __float_as_int(b1.max.y)),
- make_int4(__float_as_int(b0.min.z),
- __float_as_int(b1.min.z),
- __float_as_int(b0.max.z),
- __float_as_int(b1.max.z)),
- };
-
- memcpy(&pack.nodes[idx], data, sizeof(int4)*BVH_NODE_SIZE);
-}
-
-void BinaryBVH::pack_unaligned_inner(const BVHStackEntry& e,
- const BVHStackEntry& e0,
- const BVHStackEntry& e1)
-{
- pack_unaligned_node(e.idx,
- e0.node->get_aligned_space(),
- e1.node->get_aligned_space(),
- e0.node->bounds,
- e1.node->bounds,
- e0.encodeIdx(), e1.encodeIdx(),
- e0.node->visibility, e1.node->visibility);
-}
-
-void BinaryBVH::pack_unaligned_node(int idx,
- const Transform& aligned_space0,
- const Transform& aligned_space1,
- const BoundBox& bounds0,
- const BoundBox& bounds1,
- int c0, int c1,
- uint visibility0, uint visibility1)
-{
- assert(idx + BVH_UNALIGNED_NODE_SIZE <= pack.nodes.size());
- assert(c0 < 0 || c0 < pack.nodes.size());
- assert(c1 < 0 || c1 < pack.nodes.size());
-
- float4 data[BVH_UNALIGNED_NODE_SIZE];
- Transform space0 = BVHUnaligned::compute_node_transform(bounds0,
- aligned_space0);
- Transform space1 = BVHUnaligned::compute_node_transform(bounds1,
- aligned_space1);
- data[0] = make_float4(__int_as_float(visibility0 | PATH_RAY_NODE_UNALIGNED),
- __int_as_float(visibility1 | PATH_RAY_NODE_UNALIGNED),
- __int_as_float(c0),
- __int_as_float(c1));
-
- data[1] = space0.x;
- data[2] = space0.y;
- data[3] = space0.z;
- data[4] = space1.x;
- data[5] = space1.y;
- data[6] = space1.z;
-
- memcpy(&pack.nodes[idx], data, sizeof(float4)*BVH_UNALIGNED_NODE_SIZE);
-}
-
-void BinaryBVH::pack_nodes(const BVHNode *root)
-{
- const size_t num_nodes = root->getSubtreeSize(BVH_STAT_NODE_COUNT);
- const size_t num_leaf_nodes = root->getSubtreeSize(BVH_STAT_LEAF_COUNT);
- assert(num_leaf_nodes <= num_nodes);
- const size_t num_inner_nodes = num_nodes - num_leaf_nodes;
- size_t node_size;
- if(params.use_unaligned_nodes) {
- const size_t num_unaligned_nodes =
- root->getSubtreeSize(BVH_STAT_UNALIGNED_INNER_COUNT);
- node_size = (num_unaligned_nodes * BVH_UNALIGNED_NODE_SIZE) +
- (num_inner_nodes - num_unaligned_nodes) * BVH_NODE_SIZE;
- }
- else {
- node_size = num_inner_nodes * BVH_NODE_SIZE;
- }
- /* Resize arrays */
- pack.nodes.clear();
- pack.leaf_nodes.clear();
- /* For top level BVH, first merge existing BVH's so we know the offsets. */
- if(params.top_level) {
- pack_instances(node_size, num_leaf_nodes*BVH_NODE_LEAF_SIZE);
- }
- else {
- pack.nodes.resize(node_size);
- pack.leaf_nodes.resize(num_leaf_nodes*BVH_NODE_LEAF_SIZE);
- }
-
- int nextNodeIdx = 0, nextLeafNodeIdx = 0;
-
- vector<BVHStackEntry> stack;
- stack.reserve(BVHParams::MAX_DEPTH*2);
- if(root->is_leaf()) {
- stack.push_back(BVHStackEntry(root, nextLeafNodeIdx++));
- }
- else {
- stack.push_back(BVHStackEntry(root, nextNodeIdx));
- nextNodeIdx += node_bvh_is_unaligned(root)
- ? BVH_UNALIGNED_NODE_SIZE
- : BVH_NODE_SIZE;
- }
-
- while(stack.size()) {
- BVHStackEntry e = stack.back();
- stack.pop_back();
-
- if(e.node->is_leaf()) {
- /* leaf node */
- const LeafNode *leaf = reinterpret_cast<const LeafNode*>(e.node);
- pack_leaf(e, leaf);
- }
- else {
- /* innner node */
- int idx[2];
- for(int i = 0; i < 2; ++i) {
- if(e.node->get_child(i)->is_leaf()) {
- idx[i] = nextLeafNodeIdx++;
- }
- else {
- idx[i] = nextNodeIdx;
- nextNodeIdx += node_bvh_is_unaligned(e.node->get_child(i))
- ? BVH_UNALIGNED_NODE_SIZE
- : BVH_NODE_SIZE;
- }
- }
-
- stack.push_back(BVHStackEntry(e.node->get_child(0), idx[0]));
- stack.push_back(BVHStackEntry(e.node->get_child(1), idx[1]));
-
- pack_inner(e, stack[stack.size()-2], stack[stack.size()-1]);
- }
- }
- assert(node_size == nextNodeIdx);
- /* root index to start traversal at, to handle case of single leaf node */
- pack.root_index = (root->is_leaf())? -1: 0;
-}
-
-void BinaryBVH::refit_nodes()
-{
- assert(!params.top_level);
-
- BoundBox bbox = BoundBox::empty;
- uint visibility = 0;
- refit_node(0, (pack.root_index == -1)? true: false, bbox, visibility);
-}
-
-void BinaryBVH::refit_node(int idx, bool leaf, BoundBox& bbox, uint& visibility)
-{
- if(leaf) {
- assert(idx + BVH_NODE_LEAF_SIZE <= pack.leaf_nodes.size());
- const int4 *data = &pack.leaf_nodes[idx];
- const int c0 = data[0].x;
- const int c1 = data[0].y;
- /* refit leaf node */
- for(int prim = c0; prim < c1; prim++) {
- int pidx = pack.prim_index[prim];
- int tob = pack.prim_object[prim];
- Object *ob = objects[tob];
-
- if(pidx == -1) {
- /* object instance */
- bbox.grow(ob->bounds);
- }
- else {
- /* primitives */
- const Mesh *mesh = ob->mesh;
-
- if(pack.prim_type[prim] & PRIMITIVE_ALL_CURVE) {
- /* curves */
- int str_offset = (params.top_level)? mesh->curve_offset: 0;
- Mesh::Curve curve = mesh->get_curve(pidx - str_offset);
- int k = PRIMITIVE_UNPACK_SEGMENT(pack.prim_type[prim]);
-
- curve.bounds_grow(k, &mesh->curve_keys[0], &mesh->curve_radius[0], bbox);
-
- visibility |= PATH_RAY_CURVE;
-
- /* motion curves */
- if(mesh->use_motion_blur) {
- Attribute *attr = mesh->curve_attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
-
- if(attr) {
- size_t mesh_size = mesh->curve_keys.size();
- size_t steps = mesh->motion_steps - 1;
- float3 *key_steps = attr->data_float3();
-
- for(size_t i = 0; i < steps; i++)
- curve.bounds_grow(k, key_steps + i*mesh_size, &mesh->curve_radius[0], bbox);
- }
- }
- }
- else {
- /* triangles */
- int tri_offset = (params.top_level)? mesh->tri_offset: 0;
- Mesh::Triangle triangle = mesh->get_triangle(pidx - tri_offset);
- const float3 *vpos = &mesh->verts[0];
-
- triangle.bounds_grow(vpos, bbox);
-
- /* motion triangles */
- if(mesh->use_motion_blur) {
- Attribute *attr = mesh->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
-
- if(attr) {
- size_t mesh_size = mesh->verts.size();
- size_t steps = mesh->motion_steps - 1;
- float3 *vert_steps = attr->data_float3();
-
- for(size_t i = 0; i < steps; i++)
- triangle.bounds_grow(vert_steps + i*mesh_size, bbox);
- }
- }
- }
- }
-
- visibility |= ob->visibility;
- }
-
- /* TODO(sergey): De-duplicate with pack_leaf(). */
- float4 leaf_data[BVH_NODE_LEAF_SIZE];
- leaf_data[0].x = __int_as_float(c0);
- leaf_data[0].y = __int_as_float(c1);
- leaf_data[0].z = __uint_as_float(visibility);
- leaf_data[0].w = __uint_as_float(data[0].w);
- memcpy(&pack.leaf_nodes[idx], leaf_data, sizeof(float4)*BVH_NODE_LEAF_SIZE);
- }
- else {
- assert(idx + BVH_NODE_SIZE <= pack.nodes.size());
-
- const int4 *data = &pack.nodes[idx];
- const bool is_unaligned = (data[0].x & PATH_RAY_NODE_UNALIGNED) != 0;
- const int c0 = data[0].z;
- const int c1 = data[0].w;
- /* refit inner node, set bbox from children */
- BoundBox bbox0 = BoundBox::empty, bbox1 = BoundBox::empty;
- uint visibility0 = 0, visibility1 = 0;
-
- refit_node((c0 < 0)? -c0-1: c0, (c0 < 0), bbox0, visibility0);
- refit_node((c1 < 0)? -c1-1: c1, (c1 < 0), bbox1, visibility1);
-
- if(is_unaligned) {
- Transform aligned_space = transform_identity();
- pack_unaligned_node(idx,
- aligned_space, aligned_space,
- bbox0, bbox1,
- c0, c1,
- visibility0,
- visibility1);
- }
- else {
- pack_aligned_node(idx,
- bbox0, bbox1,
- c0, c1,
- visibility0,
- visibility1);
- }
-
- bbox.grow(bbox0);
- bbox.grow(bbox1);
- visibility = visibility0|visibility1;
- }
-}
-
-/* QBVH */
-
-/* Can we avoid this somehow or make more generic?
- *
- * Perhaps we can merge nodes in actual tree and make our
- * life easier all over the place.
- */
-static bool node_qbvh_is_unaligned(const BVHNode *node)
-{
- const BVHNode *node0 = node->get_child(0),
- *node1 = node->get_child(1);
- bool has_unaligned = false;
- if(node0->is_leaf()) {
- has_unaligned |= node0->is_unaligned;
- }
- else {
- has_unaligned |= node0->get_child(0)->is_unaligned;
- has_unaligned |= node0->get_child(1)->is_unaligned;
- }
- if(node1->is_leaf()) {
- has_unaligned |= node1->is_unaligned;
- }
- else {
- has_unaligned |= node1->get_child(0)->is_unaligned;
- has_unaligned |= node1->get_child(1)->is_unaligned;
- }
- return has_unaligned;
-}
-
-QBVH::QBVH(const BVHParams& params_, const vector<Object*>& objects_)
-: BVH(params_, objects_)
-{
- params.use_qbvh = true;
-}
-
-void QBVH::pack_leaf(const BVHStackEntry& e, const LeafNode *leaf)
-{
- float4 data[BVH_QNODE_LEAF_SIZE];
- memset(data, 0, sizeof(data));
- if(leaf->num_triangles() == 1 && pack.prim_index[leaf->lo] == -1) {
- /* object */
- data[0].x = __int_as_float(~(leaf->lo));
- data[0].y = __int_as_float(0);
- }
- else {
- /* triangle */
- data[0].x = __int_as_float(leaf->lo);
- data[0].y = __int_as_float(leaf->hi);
- }
- data[0].z = __uint_as_float(leaf->visibility);
- if(leaf->num_triangles() != 0) {
- data[0].w = __uint_as_float(pack.prim_type[leaf->lo]);
- }
-
- memcpy(&pack.leaf_nodes[e.idx], data, sizeof(float4)*BVH_QNODE_LEAF_SIZE);
-}
-
-void QBVH::pack_inner(const BVHStackEntry& e,
- const BVHStackEntry *en,
- int num)
-{
- bool has_unaligned = false;
- /* Check whether we have to create unaligned node or all nodes are aligned
- * and we can cut some corner here.
- */
- if(params.use_unaligned_nodes) {
- for(int i = 0; i < num; i++) {
- if(en[i].node->is_unaligned) {
- has_unaligned = true;
- break;
- }
- }
- }
- if(has_unaligned) {
- /* There's no unaligned children, pack into AABB node. */
- pack_unaligned_inner(e, en, num);
- }
- else {
- /* Create unaligned node with orientation transform for each of the
- * children.
- */
- pack_aligned_inner(e, en, num);
- }
-}
-
-void QBVH::pack_aligned_inner(const BVHStackEntry& e,
- const BVHStackEntry *en,
- int num)
-{
- BoundBox bounds[4];
- int child[4];
- for(int i = 0; i < num; ++i) {
- bounds[i] = en[i].node->bounds;
- child[i] = en[i].encodeIdx();
- }
- pack_aligned_node(e.idx,
- bounds,
- child,
- e.node->visibility,
- e.node->time_from,
- e.node->time_to,
- num);
-}
-
-void QBVH::pack_aligned_node(int idx,
- const BoundBox *bounds,
- const int *child,
- const uint visibility,
- const float time_from,
- const float time_to,
- const int num)
-{
- float4 data[BVH_QNODE_SIZE];
- memset(data, 0, sizeof(data));
-
- data[0].x = __uint_as_float(visibility & ~PATH_RAY_NODE_UNALIGNED);
- data[0].y = time_from;
- data[0].z = time_to;
-
- for(int i = 0; i < num; i++) {
- float3 bb_min = bounds[i].min;
- float3 bb_max = bounds[i].max;
-
- data[1][i] = bb_min.x;
- data[2][i] = bb_max.x;
- data[3][i] = bb_min.y;
- data[4][i] = bb_max.y;
- data[5][i] = bb_min.z;
- data[6][i] = bb_max.z;
-
- data[7][i] = __int_as_float(child[i]);
- }
-
- for(int i = num; i < 4; i++) {
- /* We store BB which would never be recorded as intersection
- * so kernel might safely assume there are always 4 child nodes.
- */
- data[1][i] = FLT_MAX;
- data[2][i] = -FLT_MAX;
-
- data[3][i] = FLT_MAX;
- data[4][i] = -FLT_MAX;
-
- data[5][i] = FLT_MAX;
- data[6][i] = -FLT_MAX;
-
- data[7][i] = __int_as_float(0);
- }
-
- memcpy(&pack.nodes[idx], data, sizeof(float4)*BVH_QNODE_SIZE);
-}
-
-void QBVH::pack_unaligned_inner(const BVHStackEntry& e,
- const BVHStackEntry *en,
- int num)
-{
- Transform aligned_space[4];
- BoundBox bounds[4];
- int child[4];
- for(int i = 0; i < num; ++i) {
- aligned_space[i] = en[i].node->get_aligned_space();
- bounds[i] = en[i].node->bounds;
- child[i] = en[i].encodeIdx();
- }
- pack_unaligned_node(e.idx,
- aligned_space,
- bounds,
- child,
- e.node->visibility,
- e.node->time_from,
- e.node->time_to,
- num);
-}
-
-void QBVH::pack_unaligned_node(int idx,
- const Transform *aligned_space,
- const BoundBox *bounds,
- const int *child,
- const uint visibility,
- const float time_from,
- const float time_to,
- const int num)
-{
- float4 data[BVH_UNALIGNED_QNODE_SIZE];
- memset(data, 0, sizeof(data));
-
- data[0].x = __uint_as_float(visibility | PATH_RAY_NODE_UNALIGNED);
- data[0].y = time_from;
- data[0].z = time_to;
-
- for(int i = 0; i < num; i++) {
- Transform space = BVHUnaligned::compute_node_transform(
- bounds[i],
- aligned_space[i]);
-
- data[1][i] = space.x.x;
- data[2][i] = space.x.y;
- data[3][i] = space.x.z;
-
- data[4][i] = space.y.x;
- data[5][i] = space.y.y;
- data[6][i] = space.y.z;
-
- data[7][i] = space.z.x;
- data[8][i] = space.z.y;
- data[9][i] = space.z.z;
-
- data[10][i] = space.x.w;
- data[11][i] = space.y.w;
- data[12][i] = space.z.w;
-
- data[13][i] = __int_as_float(child[i]);
- }
-
- for(int i = num; i < 4; i++) {
- /* We store BB which would never be recorded as intersection
- * so kernel might safely assume there are always 4 child nodes.
- */
-
- data[1][i] = 1.0f;
- data[2][i] = 0.0f;
- data[3][i] = 0.0f;
-
- data[4][i] = 0.0f;
- data[5][i] = 0.0f;
- data[6][i] = 0.0f;
-
- data[7][i] = 0.0f;
- data[8][i] = 0.0f;
- data[9][i] = 0.0f;
-
- data[10][i] = -FLT_MAX;
- data[11][i] = -FLT_MAX;
- data[12][i] = -FLT_MAX;
-
- data[13][i] = __int_as_float(0);
- }
-
- memcpy(&pack.nodes[idx], data, sizeof(float4)*BVH_UNALIGNED_QNODE_SIZE);
-}
-
-/* Quad SIMD Nodes */
-
-void QBVH::pack_nodes(const BVHNode *root)
-{
- /* Calculate size of the arrays required. */
- const size_t num_nodes = root->getSubtreeSize(BVH_STAT_QNODE_COUNT);
- const size_t num_leaf_nodes = root->getSubtreeSize(BVH_STAT_LEAF_COUNT);
- assert(num_leaf_nodes <= num_nodes);
- const size_t num_inner_nodes = num_nodes - num_leaf_nodes;
- size_t node_size;
- if(params.use_unaligned_nodes) {
- const size_t num_unaligned_nodes =
- root->getSubtreeSize(BVH_STAT_UNALIGNED_INNER_QNODE_COUNT);
- node_size = (num_unaligned_nodes * BVH_UNALIGNED_QNODE_SIZE) +
- (num_inner_nodes - num_unaligned_nodes) * BVH_QNODE_SIZE;
- }
- else {
- node_size = num_inner_nodes * BVH_QNODE_SIZE;
- }
- /* Resize arrays. */
- pack.nodes.clear();
- pack.leaf_nodes.clear();
- /* For top level BVH, first merge existing BVH's so we know the offsets. */
- if(params.top_level) {
- pack_instances(node_size, num_leaf_nodes*BVH_QNODE_LEAF_SIZE);
- }
- else {
- pack.nodes.resize(node_size);
- pack.leaf_nodes.resize(num_leaf_nodes*BVH_QNODE_LEAF_SIZE);
- }
-
- int nextNodeIdx = 0, nextLeafNodeIdx = 0;
-
- vector<BVHStackEntry> stack;
- stack.reserve(BVHParams::MAX_DEPTH*2);
- if(root->is_leaf()) {
- stack.push_back(BVHStackEntry(root, nextLeafNodeIdx++));
- }
- else {
- stack.push_back(BVHStackEntry(root, nextNodeIdx));
- nextNodeIdx += node_qbvh_is_unaligned(root)
- ? BVH_UNALIGNED_QNODE_SIZE
- : BVH_QNODE_SIZE;
- }
-
- while(stack.size()) {
- BVHStackEntry e = stack.back();
- stack.pop_back();
-
- if(e.node->is_leaf()) {
- /* leaf node */
- const LeafNode *leaf = reinterpret_cast<const LeafNode*>(e.node);
- pack_leaf(e, leaf);
- }
- else {
- /* Inner node. */
- const BVHNode *node = e.node;
- const BVHNode *node0 = node->get_child(0);
- const BVHNode *node1 = node->get_child(1);
- /* Collect nodes. */
- const BVHNode *nodes[4];
- int numnodes = 0;
- if(node0->is_leaf()) {
- nodes[numnodes++] = node0;
- }
- else {
- nodes[numnodes++] = node0->get_child(0);
- nodes[numnodes++] = node0->get_child(1);
- }
- if(node1->is_leaf()) {
- nodes[numnodes++] = node1;
- }
- else {
- nodes[numnodes++] = node1->get_child(0);
- nodes[numnodes++] = node1->get_child(1);
- }
- /* Push entries on the stack. */
- for(int i = 0; i < numnodes; ++i) {
- int idx;
- if(nodes[i]->is_leaf()) {
- idx = nextLeafNodeIdx++;
- }
- else {
- idx = nextNodeIdx;
- nextNodeIdx += node_qbvh_is_unaligned(nodes[i])
- ? BVH_UNALIGNED_QNODE_SIZE
- : BVH_QNODE_SIZE;
- }
- stack.push_back(BVHStackEntry(nodes[i], idx));
- }
- /* Set node. */
- pack_inner(e, &stack[stack.size()-numnodes], numnodes);
- }
- }
- assert(node_size == nextNodeIdx);
- /* Root index to start traversal at, to handle case of single leaf node. */
- pack.root_index = (root->is_leaf())? -1: 0;
-}
-
-void QBVH::refit_nodes()
-{
- assert(!params.top_level);
-
- BoundBox bbox = BoundBox::empty;
- uint visibility = 0;
- refit_node(0, (pack.root_index == -1)? true: false, bbox, visibility);
-}
-
-void QBVH::refit_node(int idx, bool leaf, BoundBox& bbox, uint& visibility)
-{
- if(leaf) {
- int4 *data = &pack.leaf_nodes[idx];
- int4 c = data[0];
- /* Refit leaf node. */
- for(int prim = c.x; prim < c.y; prim++) {
- int pidx = pack.prim_index[prim];
- int tob = pack.prim_object[prim];
- Object *ob = objects[tob];
-
- if(pidx == -1) {
- /* Object instance. */
- bbox.grow(ob->bounds);
- }
- else {
- /* Primitives. */
- const Mesh *mesh = ob->mesh;
-
- if(pack.prim_type[prim] & PRIMITIVE_ALL_CURVE) {
- /* Curves. */
- int str_offset = (params.top_level)? mesh->curve_offset: 0;
- Mesh::Curve curve = mesh->get_curve(pidx - str_offset);
- int k = PRIMITIVE_UNPACK_SEGMENT(pack.prim_type[prim]);
-
- curve.bounds_grow(k, &mesh->curve_keys[0], &mesh->curve_radius[0], bbox);
-
- visibility |= PATH_RAY_CURVE;
-
- /* Motion curves. */
- if(mesh->use_motion_blur) {
- Attribute *attr = mesh->curve_attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
-
- if(attr) {
- size_t mesh_size = mesh->curve_keys.size();
- size_t steps = mesh->motion_steps - 1;
- float3 *key_steps = attr->data_float3();
-
- for(size_t i = 0; i < steps; i++)
- curve.bounds_grow(k, key_steps + i*mesh_size, &mesh->curve_radius[0], bbox);
- }
- }
- }
- else {
- /* Triangles. */
- int tri_offset = (params.top_level)? mesh->tri_offset: 0;
- Mesh::Triangle triangle = mesh->get_triangle(pidx - tri_offset);
- const float3 *vpos = &mesh->verts[0];
-
- triangle.bounds_grow(vpos, bbox);
-
- /* Motion triangles. */
- if(mesh->use_motion_blur) {
- Attribute *attr = mesh->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
-
- if(attr) {
- size_t mesh_size = mesh->verts.size();
- size_t steps = mesh->motion_steps - 1;
- float3 *vert_steps = attr->data_float3();
-
- for(size_t i = 0; i < steps; i++)
- triangle.bounds_grow(vert_steps + i*mesh_size, bbox);
- }
- }
- }
- }
-
- visibility |= ob->visibility;
- }
-
- /* TODO(sergey): This is actually a copy of pack_leaf(),
- * but this chunk of code only knows actual data and has
- * no idea about BVHNode.
- *
- * Would be nice to de-duplicate code, but trying to make
- * making code more general ends up in much nastier code
- * in my opinion so far.
- *
- * Same applies to the inner nodes case below.
- */
- float4 leaf_data[BVH_QNODE_LEAF_SIZE];
- leaf_data[0].x = __int_as_float(c.x);
- leaf_data[0].y = __int_as_float(c.y);
- leaf_data[0].z = __uint_as_float(visibility);
- leaf_data[0].w = __uint_as_float(c.w);
- memcpy(&pack.leaf_nodes[idx], leaf_data, sizeof(float4)*BVH_QNODE_LEAF_SIZE);
- }
- else {
- int4 *data = &pack.nodes[idx];
- bool is_unaligned = (data[0].x & PATH_RAY_NODE_UNALIGNED) != 0;
- int4 c;
- if(is_unaligned) {
- c = data[13];
- }
- else {
- c = data[7];
- }
- /* Refit inner node, set bbox from children. */
- BoundBox child_bbox[4] = {BoundBox::empty,
- BoundBox::empty,
- BoundBox::empty,
- BoundBox::empty};
- uint child_visibility[4] = {0};
- int num_nodes = 0;
-
- for(int i = 0; i < 4; ++i) {
- if(c[i] != 0) {
- refit_node((c[i] < 0)? -c[i]-1: c[i], (c[i] < 0),
- child_bbox[i], child_visibility[i]);
- ++num_nodes;
- bbox.grow(child_bbox[i]);
- visibility |= child_visibility[i];
- }
- }
-
- if(is_unaligned) {
- Transform aligned_space[4] = {transform_identity(),
- transform_identity(),
- transform_identity(),
- transform_identity()};
- pack_unaligned_node(idx,
- aligned_space,
- child_bbox,
- &c[0],
- visibility,
- 0.0f,
- 1.0f,
- 4);
- }
- else {
- pack_aligned_node(idx,
- child_bbox,
- &c[0],
- visibility,
- 0.0f,
- 1.0f,
- 4);
- }
- }
-}
-
CCL_NAMESPACE_END
diff --git a/intern/cycles/bvh/bvh.h b/intern/cycles/bvh/bvh.h
index 60bc62ee6e4..7bac6112fd9 100644
--- a/intern/cycles/bvh/bvh.h
+++ b/intern/cycles/bvh/bvh.h
@@ -33,15 +33,8 @@ class LeafNode;
class Object;
class Progress;
-#define BVH_NODE_SIZE 4
-#define BVH_NODE_LEAF_SIZE 1
-#define BVH_QNODE_SIZE 8
-#define BVH_QNODE_LEAF_SIZE 1
-#define BVH_ALIGN 4096
-#define TRI_NODE_SIZE 3
-
-#define BVH_UNALIGNED_NODE_SIZE 7
-#define BVH_UNALIGNED_QNODE_SIZE 14
+#define BVH_ALIGN 4096
+#define TRI_NODE_SIZE 3
/* Packed BVH
*
@@ -54,7 +47,7 @@ struct PackedBVH {
/* BVH leaf nodes storage. */
array<int4> leaf_nodes;
/* object index to BVH node index mapping for instances */
- array<int> object_node;
+ array<int> object_node;
/* Mapping from primitive index to index in triangle array. */
array<uint> prim_tri_index;
/* Continuous storage of triangle vertices. */
@@ -110,95 +103,16 @@ protected:
virtual void refit_nodes() = 0;
};
-/* Binary BVH
- *
- * Typical BVH with each node having two children. */
-
-class BinaryBVH : public BVH {
-protected:
- /* constructor */
- friend class BVH;
- BinaryBVH(const BVHParams& params, const vector<Object*>& objects);
-
- /* pack */
- void pack_nodes(const BVHNode *root);
-
- void pack_leaf(const BVHStackEntry& e,
- const LeafNode *leaf);
- void pack_inner(const BVHStackEntry& e,
- const BVHStackEntry& e0,
- const BVHStackEntry& e1);
-
- void pack_aligned_inner(const BVHStackEntry& e,
- const BVHStackEntry& e0,
- const BVHStackEntry& e1);
- void pack_aligned_node(int idx,
- const BoundBox& b0,
- const BoundBox& b1,
- int c0, int c1,
- uint visibility0, uint visibility1);
-
- void pack_unaligned_inner(const BVHStackEntry& e,
- const BVHStackEntry& e0,
- const BVHStackEntry& e1);
- void pack_unaligned_node(int idx,
- const Transform& aligned_space0,
- const Transform& aligned_space1,
- const BoundBox& b0,
- const BoundBox& b1,
- int c0, int c1,
- uint visibility0, uint visibility1);
-
- /* refit */
- void refit_nodes();
- void refit_node(int idx, bool leaf, BoundBox& bbox, uint& visibility);
-};
-
-/* QBVH
- *
- * Quad BVH, with each node having four children, to use with SIMD instructions. */
+/* Pack Utility */
+struct BVHStackEntry
+{
+ const BVHNode *node;
+ int idx;
-class QBVH : public BVH {
-protected:
- /* constructor */
- friend class BVH;
- QBVH(const BVHParams& params, const vector<Object*>& objects);
-
- /* pack */
- void pack_nodes(const BVHNode *root);
-
- void pack_leaf(const BVHStackEntry& e, const LeafNode *leaf);
- void pack_inner(const BVHStackEntry& e, const BVHStackEntry *en, int num);
-
- void pack_aligned_inner(const BVHStackEntry& e,
- const BVHStackEntry *en,
- int num);
- void pack_aligned_node(int idx,
- const BoundBox *bounds,
- const int *child,
- const uint visibility,
- const float time_from,
- const float time_to,
- const int num);
-
- void pack_unaligned_inner(const BVHStackEntry& e,
- const BVHStackEntry *en,
- int num);
- void pack_unaligned_node(int idx,
- const Transform *aligned_space,
- const BoundBox *bounds,
- const int *child,
- const uint visibility,
- const float time_from,
- const float time_to,
- const int num);
-
- /* refit */
- void refit_nodes();
- void refit_node(int idx, bool leaf, BoundBox& bbox, uint& visibility);
+ BVHStackEntry(const BVHNode *n = 0, int i = 0);
+ int encodeIdx() const;
};
CCL_NAMESPACE_END
#endif /* __BVH_H__ */
-
diff --git a/intern/cycles/bvh/bvh2.cpp b/intern/cycles/bvh/bvh2.cpp
new file mode 100644
index 00000000000..340ba7dcf53
--- /dev/null
+++ b/intern/cycles/bvh/bvh2.cpp
@@ -0,0 +1,364 @@
+/*
+ * Adapted from code copyright 2009-2010 NVIDIA Corporation
+ * Modifications Copyright 2011, Blender Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "bvh/bvh2.h"
+
+#include "render/mesh.h"
+#include "render/object.h"
+
+#include "bvh/bvh_node.h"
+#include "bvh/bvh_unaligned.h"
+
+CCL_NAMESPACE_BEGIN
+
+static bool node_bvh_is_unaligned(const BVHNode *node)
+{
+ const BVHNode *node0 = node->get_child(0),
+ *node1 = node->get_child(1);
+ return node0->is_unaligned || node1->is_unaligned;
+}
+
+BVH2::BVH2(const BVHParams& params_, const vector<Object*>& objects_)
+: BVH(params_, objects_)
+{
+}
+
+void BVH2::pack_leaf(const BVHStackEntry& e,
+ const LeafNode *leaf)
+{
+ assert(e.idx + BVH_NODE_LEAF_SIZE <= pack.leaf_nodes.size());
+ float4 data[BVH_NODE_LEAF_SIZE];
+ memset(data, 0, sizeof(data));
+ if(leaf->num_triangles() == 1 && pack.prim_index[leaf->lo] == -1) {
+ /* object */
+ data[0].x = __int_as_float(~(leaf->lo));
+ data[0].y = __int_as_float(0);
+ }
+ else {
+ /* triangle */
+ data[0].x = __int_as_float(leaf->lo);
+ data[0].y = __int_as_float(leaf->hi);
+ }
+ data[0].z = __uint_as_float(leaf->visibility);
+ if(leaf->num_triangles() != 0) {
+ data[0].w = __uint_as_float(pack.prim_type[leaf->lo]);
+ }
+
+ memcpy(&pack.leaf_nodes[e.idx], data, sizeof(float4)*BVH_NODE_LEAF_SIZE);
+}
+
+void BVH2::pack_inner(const BVHStackEntry& e,
+ const BVHStackEntry& e0,
+ const BVHStackEntry& e1)
+{
+ if(e0.node->is_unaligned || e1.node->is_unaligned) {
+ pack_unaligned_inner(e, e0, e1);
+ } else {
+ pack_aligned_inner(e, e0, e1);
+ }
+}
+
+void BVH2::pack_aligned_inner(const BVHStackEntry& e,
+ const BVHStackEntry& e0,
+ const BVHStackEntry& e1)
+{
+ pack_aligned_node(e.idx,
+ e0.node->bounds, e1.node->bounds,
+ e0.encodeIdx(), e1.encodeIdx(),
+ e0.node->visibility, e1.node->visibility);
+}
+
+void BVH2::pack_aligned_node(int idx,
+ const BoundBox& b0,
+ const BoundBox& b1,
+ int c0, int c1,
+ uint visibility0, uint visibility1)
+{
+ assert(idx + BVH_NODE_SIZE <= pack.nodes.size());
+ assert(c0 < 0 || c0 < pack.nodes.size());
+ assert(c1 < 0 || c1 < pack.nodes.size());
+
+ int4 data[BVH_NODE_SIZE] = {
+ make_int4(visibility0 & ~PATH_RAY_NODE_UNALIGNED,
+ visibility1 & ~PATH_RAY_NODE_UNALIGNED,
+ c0, c1),
+ make_int4(__float_as_int(b0.min.x),
+ __float_as_int(b1.min.x),
+ __float_as_int(b0.max.x),
+ __float_as_int(b1.max.x)),
+ make_int4(__float_as_int(b0.min.y),
+ __float_as_int(b1.min.y),
+ __float_as_int(b0.max.y),
+ __float_as_int(b1.max.y)),
+ make_int4(__float_as_int(b0.min.z),
+ __float_as_int(b1.min.z),
+ __float_as_int(b0.max.z),
+ __float_as_int(b1.max.z)),
+ };
+
+ memcpy(&pack.nodes[idx], data, sizeof(int4)*BVH_NODE_SIZE);
+}
+
+void BVH2::pack_unaligned_inner(const BVHStackEntry& e,
+ const BVHStackEntry& e0,
+ const BVHStackEntry& e1)
+{
+ pack_unaligned_node(e.idx,
+ e0.node->get_aligned_space(),
+ e1.node->get_aligned_space(),
+ e0.node->bounds,
+ e1.node->bounds,
+ e0.encodeIdx(), e1.encodeIdx(),
+ e0.node->visibility, e1.node->visibility);
+}
+
+void BVH2::pack_unaligned_node(int idx,
+ const Transform& aligned_space0,
+ const Transform& aligned_space1,
+ const BoundBox& bounds0,
+ const BoundBox& bounds1,
+ int c0, int c1,
+ uint visibility0, uint visibility1)
+{
+ assert(idx + BVH_UNALIGNED_NODE_SIZE <= pack.nodes.size());
+ assert(c0 < 0 || c0 < pack.nodes.size());
+ assert(c1 < 0 || c1 < pack.nodes.size());
+
+ float4 data[BVH_UNALIGNED_NODE_SIZE];
+ Transform space0 = BVHUnaligned::compute_node_transform(bounds0,
+ aligned_space0);
+ Transform space1 = BVHUnaligned::compute_node_transform(bounds1,
+ aligned_space1);
+ data[0] = make_float4(__int_as_float(visibility0 | PATH_RAY_NODE_UNALIGNED),
+ __int_as_float(visibility1 | PATH_RAY_NODE_UNALIGNED),
+ __int_as_float(c0),
+ __int_as_float(c1));
+
+ data[1] = space0.x;
+ data[2] = space0.y;
+ data[3] = space0.z;
+ data[4] = space1.x;
+ data[5] = space1.y;
+ data[6] = space1.z;
+
+ memcpy(&pack.nodes[idx], data, sizeof(float4)*BVH_UNALIGNED_NODE_SIZE);
+}
+
+void BVH2::pack_nodes(const BVHNode *root)
+{
+ const size_t num_nodes = root->getSubtreeSize(BVH_STAT_NODE_COUNT);
+ const size_t num_leaf_nodes = root->getSubtreeSize(BVH_STAT_LEAF_COUNT);
+ assert(num_leaf_nodes <= num_nodes);
+ const size_t num_inner_nodes = num_nodes - num_leaf_nodes;
+ size_t node_size;
+ if(params.use_unaligned_nodes) {
+ const size_t num_unaligned_nodes =
+ root->getSubtreeSize(BVH_STAT_UNALIGNED_INNER_COUNT);
+ node_size = (num_unaligned_nodes * BVH_UNALIGNED_NODE_SIZE) +
+ (num_inner_nodes - num_unaligned_nodes) * BVH_NODE_SIZE;
+ }
+ else {
+ node_size = num_inner_nodes * BVH_NODE_SIZE;
+ }
+ /* Resize arrays */
+ pack.nodes.clear();
+ pack.leaf_nodes.clear();
+ /* For top level BVH, first merge existing BVH's so we know the offsets. */
+ if(params.top_level) {
+ pack_instances(node_size, num_leaf_nodes*BVH_NODE_LEAF_SIZE);
+ }
+ else {
+ pack.nodes.resize(node_size);
+ pack.leaf_nodes.resize(num_leaf_nodes*BVH_NODE_LEAF_SIZE);
+ }
+
+ int nextNodeIdx = 0, nextLeafNodeIdx = 0;
+
+ vector<BVHStackEntry> stack;
+ stack.reserve(BVHParams::MAX_DEPTH*2);
+ if(root->is_leaf()) {
+ stack.push_back(BVHStackEntry(root, nextLeafNodeIdx++));
+ }
+ else {
+ stack.push_back(BVHStackEntry(root, nextNodeIdx));
+ nextNodeIdx += node_bvh_is_unaligned(root)
+ ? BVH_UNALIGNED_NODE_SIZE
+ : BVH_NODE_SIZE;
+ }
+
+ while(stack.size()) {
+ BVHStackEntry e = stack.back();
+ stack.pop_back();
+
+ if(e.node->is_leaf()) {
+ /* leaf node */
+ const LeafNode *leaf = reinterpret_cast<const LeafNode*>(e.node);
+ pack_leaf(e, leaf);
+ }
+ else {
+			/* inner node */
+ int idx[2];
+ for(int i = 0; i < 2; ++i) {
+ if(e.node->get_child(i)->is_leaf()) {
+ idx[i] = nextLeafNodeIdx++;
+ }
+ else {
+ idx[i] = nextNodeIdx;
+ nextNodeIdx += node_bvh_is_unaligned(e.node->get_child(i))
+ ? BVH_UNALIGNED_NODE_SIZE
+ : BVH_NODE_SIZE;
+ }
+ }
+
+ stack.push_back(BVHStackEntry(e.node->get_child(0), idx[0]));
+ stack.push_back(BVHStackEntry(e.node->get_child(1), idx[1]));
+
+ pack_inner(e, stack[stack.size()-2], stack[stack.size()-1]);
+ }
+ }
+ assert(node_size == nextNodeIdx);
+ /* root index to start traversal at, to handle case of single leaf node */
+ pack.root_index = (root->is_leaf())? -1: 0;
+}
+
+void BVH2::refit_nodes()
+{
+ assert(!params.top_level);
+
+ BoundBox bbox = BoundBox::empty;
+ uint visibility = 0;
+ refit_node(0, (pack.root_index == -1)? true: false, bbox, visibility);
+}
+
+void BVH2::refit_node(int idx, bool leaf, BoundBox& bbox, uint& visibility)
+{
+ if(leaf) {
+ assert(idx + BVH_NODE_LEAF_SIZE <= pack.leaf_nodes.size());
+ const int4 *data = &pack.leaf_nodes[idx];
+ const int c0 = data[0].x;
+ const int c1 = data[0].y;
+ /* refit leaf node */
+ for(int prim = c0; prim < c1; prim++) {
+ int pidx = pack.prim_index[prim];
+ int tob = pack.prim_object[prim];
+ Object *ob = objects[tob];
+
+ if(pidx == -1) {
+ /* object instance */
+ bbox.grow(ob->bounds);
+ }
+ else {
+ /* primitives */
+ const Mesh *mesh = ob->mesh;
+
+ if(pack.prim_type[prim] & PRIMITIVE_ALL_CURVE) {
+ /* curves */
+ int str_offset = (params.top_level)? mesh->curve_offset: 0;
+ Mesh::Curve curve = mesh->get_curve(pidx - str_offset);
+ int k = PRIMITIVE_UNPACK_SEGMENT(pack.prim_type[prim]);
+
+ curve.bounds_grow(k, &mesh->curve_keys[0], &mesh->curve_radius[0], bbox);
+
+ visibility |= PATH_RAY_CURVE;
+
+ /* motion curves */
+ if(mesh->use_motion_blur) {
+ Attribute *attr = mesh->curve_attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
+
+ if(attr) {
+ size_t mesh_size = mesh->curve_keys.size();
+ size_t steps = mesh->motion_steps - 1;
+ float3 *key_steps = attr->data_float3();
+
+ for(size_t i = 0; i < steps; i++)
+ curve.bounds_grow(k, key_steps + i*mesh_size, &mesh->curve_radius[0], bbox);
+ }
+ }
+ }
+ else {
+ /* triangles */
+ int tri_offset = (params.top_level)? mesh->tri_offset: 0;
+ Mesh::Triangle triangle = mesh->get_triangle(pidx - tri_offset);
+ const float3 *vpos = &mesh->verts[0];
+
+ triangle.bounds_grow(vpos, bbox);
+
+ /* motion triangles */
+ if(mesh->use_motion_blur) {
+ Attribute *attr = mesh->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
+
+ if(attr) {
+ size_t mesh_size = mesh->verts.size();
+ size_t steps = mesh->motion_steps - 1;
+ float3 *vert_steps = attr->data_float3();
+
+ for(size_t i = 0; i < steps; i++)
+ triangle.bounds_grow(vert_steps + i*mesh_size, bbox);
+ }
+ }
+ }
+ }
+
+ visibility |= ob->visibility;
+ }
+
+ /* TODO(sergey): De-duplicate with pack_leaf(). */
+ float4 leaf_data[BVH_NODE_LEAF_SIZE];
+ leaf_data[0].x = __int_as_float(c0);
+ leaf_data[0].y = __int_as_float(c1);
+ leaf_data[0].z = __uint_as_float(visibility);
+ leaf_data[0].w = __uint_as_float(data[0].w);
+ memcpy(&pack.leaf_nodes[idx], leaf_data, sizeof(float4)*BVH_NODE_LEAF_SIZE);
+ }
+ else {
+ assert(idx + BVH_NODE_SIZE <= pack.nodes.size());
+
+ const int4 *data = &pack.nodes[idx];
+ const bool is_unaligned = (data[0].x & PATH_RAY_NODE_UNALIGNED) != 0;
+ const int c0 = data[0].z;
+ const int c1 = data[0].w;
+ /* refit inner node, set bbox from children */
+ BoundBox bbox0 = BoundBox::empty, bbox1 = BoundBox::empty;
+ uint visibility0 = 0, visibility1 = 0;
+
+ refit_node((c0 < 0)? -c0-1: c0, (c0 < 0), bbox0, visibility0);
+ refit_node((c1 < 0)? -c1-1: c1, (c1 < 0), bbox1, visibility1);
+
+ if(is_unaligned) {
+ Transform aligned_space = transform_identity();
+ pack_unaligned_node(idx,
+ aligned_space, aligned_space,
+ bbox0, bbox1,
+ c0, c1,
+ visibility0,
+ visibility1);
+ }
+ else {
+ pack_aligned_node(idx,
+ bbox0, bbox1,
+ c0, c1,
+ visibility0,
+ visibility1);
+ }
+
+ bbox.grow(bbox0);
+ bbox.grow(bbox1);
+ visibility = visibility0|visibility1;
+ }
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/bvh/bvh2.h b/intern/cycles/bvh/bvh2.h
new file mode 100644
index 00000000000..df65ddca5b7
--- /dev/null
+++ b/intern/cycles/bvh/bvh2.h
@@ -0,0 +1,87 @@
+/*
+ * Adapted from code copyright 2009-2010 NVIDIA Corporation
+ * Modifications Copyright 2011, Blender Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __BVH2_H__
+#define __BVH2_H__
+
+#include "bvh/bvh.h"
+#include "bvh/bvh_params.h"
+
+#include "util/util_types.h"
+#include "util/util_vector.h"
+
+CCL_NAMESPACE_BEGIN
+
+class BVHNode;
+struct BVHStackEntry;
+class BVHParams;
+class BoundBox;
+class LeafNode;
+class Object;
+class Progress;
+
+#define BVH_NODE_SIZE 4
+#define BVH_NODE_LEAF_SIZE 1
+#define BVH_UNALIGNED_NODE_SIZE 7
+
+/* BVH2
+ *
+ * Typical BVH with each node having two children.
+ */
+class BVH2 : public BVH {
+protected:
+ /* constructor */
+ friend class BVH;
+ BVH2(const BVHParams& params, const vector<Object*>& objects);
+
+ /* pack */
+ void pack_nodes(const BVHNode *root);
+
+ void pack_leaf(const BVHStackEntry& e,
+ const LeafNode *leaf);
+ void pack_inner(const BVHStackEntry& e,
+ const BVHStackEntry& e0,
+ const BVHStackEntry& e1);
+
+ void pack_aligned_inner(const BVHStackEntry& e,
+ const BVHStackEntry& e0,
+ const BVHStackEntry& e1);
+ void pack_aligned_node(int idx,
+ const BoundBox& b0,
+ const BoundBox& b1,
+ int c0, int c1,
+ uint visibility0, uint visibility1);
+
+ void pack_unaligned_inner(const BVHStackEntry& e,
+ const BVHStackEntry& e0,
+ const BVHStackEntry& e1);
+ void pack_unaligned_node(int idx,
+ const Transform& aligned_space0,
+ const Transform& aligned_space1,
+ const BoundBox& b0,
+ const BoundBox& b1,
+ int c0, int c1,
+ uint visibility0, uint visibility1);
+
+ /* refit */
+ void refit_nodes();
+ void refit_node(int idx, bool leaf, BoundBox& bbox, uint& visibility);
+};
+
+CCL_NAMESPACE_END
+
+#endif /* __BVH2_H__ */
diff --git a/intern/cycles/bvh/bvh4.cpp b/intern/cycles/bvh/bvh4.cpp
new file mode 100644
index 00000000000..5034ab811d5
--- /dev/null
+++ b/intern/cycles/bvh/bvh4.cpp
@@ -0,0 +1,516 @@
+/*
+ * Adapted from code copyright 2009-2010 NVIDIA Corporation
+ * Modifications Copyright 2011, Blender Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "bvh/bvh4.h"
+
+#include "render/mesh.h"
+#include "render/object.h"
+
+#include "bvh/bvh_node.h"
+#include "bvh/bvh_unaligned.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* Can we avoid this somehow or make more generic?
+ *
+ * Perhaps we can merge nodes in actual tree and make our
+ * life easier all over the place.
+ */
+static bool node_qbvh_is_unaligned(const BVHNode *node)
+{
+ const BVHNode *node0 = node->get_child(0),
+ *node1 = node->get_child(1);
+ bool has_unaligned = false;
+ if(node0->is_leaf()) {
+ has_unaligned |= node0->is_unaligned;
+ }
+ else {
+ has_unaligned |= node0->get_child(0)->is_unaligned;
+ has_unaligned |= node0->get_child(1)->is_unaligned;
+ }
+ if(node1->is_leaf()) {
+ has_unaligned |= node1->is_unaligned;
+ }
+ else {
+ has_unaligned |= node1->get_child(0)->is_unaligned;
+ has_unaligned |= node1->get_child(1)->is_unaligned;
+ }
+ return has_unaligned;
+}
+
+BVH4::BVH4(const BVHParams& params_, const vector<Object*>& objects_)
+: BVH(params_, objects_)
+{
+ params.use_qbvh = true;
+}
+
+void BVH4::pack_leaf(const BVHStackEntry& e, const LeafNode *leaf)
+{
+ float4 data[BVH_QNODE_LEAF_SIZE];
+ memset(data, 0, sizeof(data));
+ if(leaf->num_triangles() == 1 && pack.prim_index[leaf->lo] == -1) {
+ /* object */
+ data[0].x = __int_as_float(~(leaf->lo));
+ data[0].y = __int_as_float(0);
+ }
+ else {
+ /* triangle */
+ data[0].x = __int_as_float(leaf->lo);
+ data[0].y = __int_as_float(leaf->hi);
+ }
+ data[0].z = __uint_as_float(leaf->visibility);
+ if(leaf->num_triangles() != 0) {
+ data[0].w = __uint_as_float(pack.prim_type[leaf->lo]);
+ }
+
+ memcpy(&pack.leaf_nodes[e.idx], data, sizeof(float4)*BVH_QNODE_LEAF_SIZE);
+}
+
+void BVH4::pack_inner(const BVHStackEntry& e,
+ const BVHStackEntry *en,
+ int num)
+{
+ bool has_unaligned = false;
+ /* Check whether we have to create unaligned node or all nodes are aligned
+ * and we can cut some corner here.
+ */
+ if(params.use_unaligned_nodes) {
+ for(int i = 0; i < num; i++) {
+ if(en[i].node->is_unaligned) {
+ has_unaligned = true;
+ break;
+ }
+ }
+ }
+	if(has_unaligned) {
+		/* Create unaligned node with orientation transform for each of the
+		 * children.
+		 */
+		pack_unaligned_inner(e, en, num);
+	}
+	else {
+		/* There are no unaligned children, pack into AABB node. */
+		pack_aligned_inner(e, en, num);
+	}
+}
+
+void BVH4::pack_aligned_inner(const BVHStackEntry& e,
+ const BVHStackEntry *en,
+ int num)
+{
+ BoundBox bounds[4];
+ int child[4];
+ for(int i = 0; i < num; ++i) {
+ bounds[i] = en[i].node->bounds;
+ child[i] = en[i].encodeIdx();
+ }
+ pack_aligned_node(e.idx,
+ bounds,
+ child,
+ e.node->visibility,
+ e.node->time_from,
+ e.node->time_to,
+ num);
+}
+
+void BVH4::pack_aligned_node(int idx,
+ const BoundBox *bounds,
+ const int *child,
+ const uint visibility,
+ const float time_from,
+ const float time_to,
+ const int num)
+{
+ float4 data[BVH_QNODE_SIZE];
+ memset(data, 0, sizeof(data));
+
+ data[0].x = __uint_as_float(visibility & ~PATH_RAY_NODE_UNALIGNED);
+ data[0].y = time_from;
+ data[0].z = time_to;
+
+ for(int i = 0; i < num; i++) {
+ float3 bb_min = bounds[i].min;
+ float3 bb_max = bounds[i].max;
+
+ data[1][i] = bb_min.x;
+ data[2][i] = bb_max.x;
+ data[3][i] = bb_min.y;
+ data[4][i] = bb_max.y;
+ data[5][i] = bb_min.z;
+ data[6][i] = bb_max.z;
+
+ data[7][i] = __int_as_float(child[i]);
+ }
+
+ for(int i = num; i < 4; i++) {
+ /* We store BB which would never be recorded as intersection
+ * so kernel might safely assume there are always 4 child nodes.
+ */
+ data[1][i] = FLT_MAX;
+ data[2][i] = -FLT_MAX;
+
+ data[3][i] = FLT_MAX;
+ data[4][i] = -FLT_MAX;
+
+ data[5][i] = FLT_MAX;
+ data[6][i] = -FLT_MAX;
+
+ data[7][i] = __int_as_float(0);
+ }
+
+ memcpy(&pack.nodes[idx], data, sizeof(float4)*BVH_QNODE_SIZE);
+}
+
+void BVH4::pack_unaligned_inner(const BVHStackEntry& e,
+ const BVHStackEntry *en,
+ int num)
+{
+ Transform aligned_space[4];
+ BoundBox bounds[4];
+ int child[4];
+ for(int i = 0; i < num; ++i) {
+ aligned_space[i] = en[i].node->get_aligned_space();
+ bounds[i] = en[i].node->bounds;
+ child[i] = en[i].encodeIdx();
+ }
+ pack_unaligned_node(e.idx,
+ aligned_space,
+ bounds,
+ child,
+ e.node->visibility,
+ e.node->time_from,
+ e.node->time_to,
+ num);
+}
+
+void BVH4::pack_unaligned_node(int idx,
+ const Transform *aligned_space,
+ const BoundBox *bounds,
+ const int *child,
+ const uint visibility,
+ const float time_from,
+ const float time_to,
+ const int num)
+{
+ float4 data[BVH_UNALIGNED_QNODE_SIZE];
+ memset(data, 0, sizeof(data));
+
+ data[0].x = __uint_as_float(visibility | PATH_RAY_NODE_UNALIGNED);
+ data[0].y = time_from;
+ data[0].z = time_to;
+
+ for(int i = 0; i < num; i++) {
+ Transform space = BVHUnaligned::compute_node_transform(
+ bounds[i],
+ aligned_space[i]);
+
+ data[1][i] = space.x.x;
+ data[2][i] = space.x.y;
+ data[3][i] = space.x.z;
+
+ data[4][i] = space.y.x;
+ data[5][i] = space.y.y;
+ data[6][i] = space.y.z;
+
+ data[7][i] = space.z.x;
+ data[8][i] = space.z.y;
+ data[9][i] = space.z.z;
+
+ data[10][i] = space.x.w;
+ data[11][i] = space.y.w;
+ data[12][i] = space.z.w;
+
+ data[13][i] = __int_as_float(child[i]);
+ }
+
+ for(int i = num; i < 4; i++) {
+ /* We store BB which would never be recorded as intersection
+ * so kernel might safely assume there are always 4 child nodes.
+ */
+
+ data[1][i] = 1.0f;
+ data[2][i] = 0.0f;
+ data[3][i] = 0.0f;
+
+ data[4][i] = 0.0f;
+ data[5][i] = 0.0f;
+ data[6][i] = 0.0f;
+
+ data[7][i] = 0.0f;
+ data[8][i] = 0.0f;
+ data[9][i] = 0.0f;
+
+ data[10][i] = -FLT_MAX;
+ data[11][i] = -FLT_MAX;
+ data[12][i] = -FLT_MAX;
+
+ data[13][i] = __int_as_float(0);
+ }
+
+ memcpy(&pack.nodes[idx], data, sizeof(float4)*BVH_UNALIGNED_QNODE_SIZE);
+}
+
+/* Quad SIMD Nodes */
+
+void BVH4::pack_nodes(const BVHNode *root)
+{
+ /* Calculate size of the arrays required. */
+ const size_t num_nodes = root->getSubtreeSize(BVH_STAT_QNODE_COUNT);
+ const size_t num_leaf_nodes = root->getSubtreeSize(BVH_STAT_LEAF_COUNT);
+ assert(num_leaf_nodes <= num_nodes);
+ const size_t num_inner_nodes = num_nodes - num_leaf_nodes;
+ size_t node_size;
+ if(params.use_unaligned_nodes) {
+ const size_t num_unaligned_nodes =
+ root->getSubtreeSize(BVH_STAT_UNALIGNED_INNER_QNODE_COUNT);
+ node_size = (num_unaligned_nodes * BVH_UNALIGNED_QNODE_SIZE) +
+ (num_inner_nodes - num_unaligned_nodes) * BVH_QNODE_SIZE;
+ }
+ else {
+ node_size = num_inner_nodes * BVH_QNODE_SIZE;
+ }
+ /* Resize arrays. */
+ pack.nodes.clear();
+ pack.leaf_nodes.clear();
+ /* For top level BVH, first merge existing BVH's so we know the offsets. */
+ if(params.top_level) {
+ pack_instances(node_size, num_leaf_nodes*BVH_QNODE_LEAF_SIZE);
+ }
+ else {
+ pack.nodes.resize(node_size);
+ pack.leaf_nodes.resize(num_leaf_nodes*BVH_QNODE_LEAF_SIZE);
+ }
+
+ int nextNodeIdx = 0, nextLeafNodeIdx = 0;
+
+ vector<BVHStackEntry> stack;
+ stack.reserve(BVHParams::MAX_DEPTH*2);
+ if(root->is_leaf()) {
+ stack.push_back(BVHStackEntry(root, nextLeafNodeIdx++));
+ }
+ else {
+ stack.push_back(BVHStackEntry(root, nextNodeIdx));
+ nextNodeIdx += node_qbvh_is_unaligned(root)
+ ? BVH_UNALIGNED_QNODE_SIZE
+ : BVH_QNODE_SIZE;
+ }
+
+ while(stack.size()) {
+ BVHStackEntry e = stack.back();
+ stack.pop_back();
+
+ if(e.node->is_leaf()) {
+ /* leaf node */
+ const LeafNode *leaf = reinterpret_cast<const LeafNode*>(e.node);
+ pack_leaf(e, leaf);
+ }
+ else {
+ /* Inner node. */
+ const BVHNode *node = e.node;
+ const BVHNode *node0 = node->get_child(0);
+ const BVHNode *node1 = node->get_child(1);
+ /* Collect nodes. */
+ const BVHNode *nodes[4];
+ int numnodes = 0;
+ if(node0->is_leaf()) {
+ nodes[numnodes++] = node0;
+ }
+ else {
+ nodes[numnodes++] = node0->get_child(0);
+ nodes[numnodes++] = node0->get_child(1);
+ }
+ if(node1->is_leaf()) {
+ nodes[numnodes++] = node1;
+ }
+ else {
+ nodes[numnodes++] = node1->get_child(0);
+ nodes[numnodes++] = node1->get_child(1);
+ }
+ /* Push entries on the stack. */
+ for(int i = 0; i < numnodes; ++i) {
+ int idx;
+ if(nodes[i]->is_leaf()) {
+ idx = nextLeafNodeIdx++;
+ }
+ else {
+ idx = nextNodeIdx;
+ nextNodeIdx += node_qbvh_is_unaligned(nodes[i])
+ ? BVH_UNALIGNED_QNODE_SIZE
+ : BVH_QNODE_SIZE;
+ }
+ stack.push_back(BVHStackEntry(nodes[i], idx));
+ }
+ /* Set node. */
+ pack_inner(e, &stack[stack.size()-numnodes], numnodes);
+ }
+ }
+ assert(node_size == nextNodeIdx);
+ /* Root index to start traversal at, to handle case of single leaf node. */
+ pack.root_index = (root->is_leaf())? -1: 0;
+}
+
+void BVH4::refit_nodes()
+{
+ assert(!params.top_level);
+
+ BoundBox bbox = BoundBox::empty;
+ uint visibility = 0;
+ refit_node(0, (pack.root_index == -1)? true: false, bbox, visibility);
+}
+
+void BVH4::refit_node(int idx, bool leaf, BoundBox& bbox, uint& visibility)
+{
+ if(leaf) {
+ int4 *data = &pack.leaf_nodes[idx];
+ int4 c = data[0];
+ /* Refit leaf node. */
+ for(int prim = c.x; prim < c.y; prim++) {
+ int pidx = pack.prim_index[prim];
+ int tob = pack.prim_object[prim];
+ Object *ob = objects[tob];
+
+ if(pidx == -1) {
+ /* Object instance. */
+ bbox.grow(ob->bounds);
+ }
+ else {
+ /* Primitives. */
+ const Mesh *mesh = ob->mesh;
+
+ if(pack.prim_type[prim] & PRIMITIVE_ALL_CURVE) {
+ /* Curves. */
+ int str_offset = (params.top_level)? mesh->curve_offset: 0;
+ Mesh::Curve curve = mesh->get_curve(pidx - str_offset);
+ int k = PRIMITIVE_UNPACK_SEGMENT(pack.prim_type[prim]);
+
+ curve.bounds_grow(k, &mesh->curve_keys[0], &mesh->curve_radius[0], bbox);
+
+ visibility |= PATH_RAY_CURVE;
+
+ /* Motion curves. */
+ if(mesh->use_motion_blur) {
+ Attribute *attr = mesh->curve_attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
+
+ if(attr) {
+ size_t mesh_size = mesh->curve_keys.size();
+ size_t steps = mesh->motion_steps - 1;
+ float3 *key_steps = attr->data_float3();
+
+ for(size_t i = 0; i < steps; i++)
+ curve.bounds_grow(k, key_steps + i*mesh_size, &mesh->curve_radius[0], bbox);
+ }
+ }
+ }
+ else {
+ /* Triangles. */
+ int tri_offset = (params.top_level)? mesh->tri_offset: 0;
+ Mesh::Triangle triangle = mesh->get_triangle(pidx - tri_offset);
+ const float3 *vpos = &mesh->verts[0];
+
+ triangle.bounds_grow(vpos, bbox);
+
+ /* Motion triangles. */
+ if(mesh->use_motion_blur) {
+ Attribute *attr = mesh->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
+
+ if(attr) {
+ size_t mesh_size = mesh->verts.size();
+ size_t steps = mesh->motion_steps - 1;
+ float3 *vert_steps = attr->data_float3();
+
+ for(size_t i = 0; i < steps; i++)
+ triangle.bounds_grow(vert_steps + i*mesh_size, bbox);
+ }
+ }
+ }
+ }
+
+ visibility |= ob->visibility;
+ }
+
+ /* TODO(sergey): This is actually a copy of pack_leaf(),
+ * but this chunk of code only knows actual data and has
+ * no idea about BVHNode.
+ *
+	 * Would be nice to de-duplicate code, but trying to
+	 * make code more general ends up in much nastier code
+ * in my opinion so far.
+ *
+ * Same applies to the inner nodes case below.
+ */
+ float4 leaf_data[BVH_QNODE_LEAF_SIZE];
+ leaf_data[0].x = __int_as_float(c.x);
+ leaf_data[0].y = __int_as_float(c.y);
+ leaf_data[0].z = __uint_as_float(visibility);
+ leaf_data[0].w = __uint_as_float(c.w);
+ memcpy(&pack.leaf_nodes[idx], leaf_data, sizeof(float4)*BVH_QNODE_LEAF_SIZE);
+ }
+ else {
+ int4 *data = &pack.nodes[idx];
+ bool is_unaligned = (data[0].x & PATH_RAY_NODE_UNALIGNED) != 0;
+ int4 c;
+ if(is_unaligned) {
+ c = data[13];
+ }
+ else {
+ c = data[7];
+ }
+ /* Refit inner node, set bbox from children. */
+ BoundBox child_bbox[4] = {BoundBox::empty,
+ BoundBox::empty,
+ BoundBox::empty,
+ BoundBox::empty};
+ uint child_visibility[4] = {0};
+ int num_nodes = 0;
+
+ for(int i = 0; i < 4; ++i) {
+ if(c[i] != 0) {
+ refit_node((c[i] < 0)? -c[i]-1: c[i], (c[i] < 0),
+ child_bbox[i], child_visibility[i]);
+ ++num_nodes;
+ bbox.grow(child_bbox[i]);
+ visibility |= child_visibility[i];
+ }
+ }
+
+ if(is_unaligned) {
+ Transform aligned_space[4] = {transform_identity(),
+ transform_identity(),
+ transform_identity(),
+ transform_identity()};
+ pack_unaligned_node(idx,
+ aligned_space,
+ child_bbox,
+ &c[0],
+ visibility,
+ 0.0f,
+ 1.0f,
+ 4);
+ }
+ else {
+ pack_aligned_node(idx,
+ child_bbox,
+ &c[0],
+ visibility,
+ 0.0f,
+ 1.0f,
+ 4);
+ }
+ }
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/bvh/bvh4.h b/intern/cycles/bvh/bvh4.h
new file mode 100644
index 00000000000..310909a37e1
--- /dev/null
+++ b/intern/cycles/bvh/bvh4.h
@@ -0,0 +1,87 @@
+/*
+ * Adapted from code copyright 2009-2010 NVIDIA Corporation
+ * Modifications Copyright 2011, Blender Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __BVH4_H__
+#define __BVH4_H__
+
+#include "bvh/bvh.h"
+#include "bvh/bvh_params.h"
+
+#include "util/util_types.h"
+#include "util/util_vector.h"
+
+CCL_NAMESPACE_BEGIN
+
+class BVHNode;
+struct BVHStackEntry;
+class BVHParams;
+class BoundBox;
+class LeafNode;
+class Object;
+class Progress;
+
+#define BVH_QNODE_SIZE 8
+#define BVH_QNODE_LEAF_SIZE 1
+#define BVH_UNALIGNED_QNODE_SIZE 14
+
+/* BVH4
+ *
+ * Quad BVH, with each node having four children, to use with SIMD instructions.
+ */
+class BVH4 : public BVH {
+protected:
+ /* constructor */
+ friend class BVH;
+ BVH4(const BVHParams& params, const vector<Object*>& objects);
+
+ /* pack */
+ void pack_nodes(const BVHNode *root);
+
+ void pack_leaf(const BVHStackEntry& e, const LeafNode *leaf);
+ void pack_inner(const BVHStackEntry& e, const BVHStackEntry *en, int num);
+
+ void pack_aligned_inner(const BVHStackEntry& e,
+ const BVHStackEntry *en,
+ int num);
+ void pack_aligned_node(int idx,
+ const BoundBox *bounds,
+ const int *child,
+ const uint visibility,
+ const float time_from,
+ const float time_to,
+ const int num);
+
+ void pack_unaligned_inner(const BVHStackEntry& e,
+ const BVHStackEntry *en,
+ int num);
+ void pack_unaligned_node(int idx,
+ const Transform *aligned_space,
+ const BoundBox *bounds,
+ const int *child,
+ const uint visibility,
+ const float time_from,
+ const float time_to,
+ const int num);
+
+ /* refit */
+ void refit_nodes();
+ void refit_node(int idx, bool leaf, BoundBox& bbox, uint& visibility);
+};
+
+CCL_NAMESPACE_END
+
+#endif /* __BVH4_H__ */
diff --git a/intern/cycles/bvh/bvh_binning.cpp b/intern/cycles/bvh/bvh_binning.cpp
index 3226008f511..63a7fc11668 100644
--- a/intern/cycles/bvh/bvh_binning.cpp
+++ b/intern/cycles/bvh/bvh_binning.cpp
@@ -17,10 +17,10 @@
//#define __KERNEL_SSE__
-#include <stdlib.h>
-
#include "bvh/bvh_binning.h"
+#include <stdlib.h>
+
#include "util/util_algorithm.h"
#include "util/util_boundbox.h"
#include "util/util_types.h"
diff --git a/intern/cycles/bvh/bvh_binning.h b/intern/cycles/bvh/bvh_binning.h
index 285f9c56a62..c2e259b1696 100644
--- a/intern/cycles/bvh/bvh_binning.h
+++ b/intern/cycles/bvh/bvh_binning.h
@@ -111,5 +111,4 @@ protected:
CCL_NAMESPACE_END
-#endif
-
+#endif /* __BVH_BINNING_H__ */
diff --git a/intern/cycles/bvh/bvh_build.cpp b/intern/cycles/bvh/bvh_build.cpp
index 95c71b54da0..1880964355c 100644
--- a/intern/cycles/bvh/bvh_build.cpp
+++ b/intern/cycles/bvh/bvh_build.cpp
@@ -15,8 +15,9 @@
* limitations under the License.
*/
-#include "bvh/bvh_binning.h"
#include "bvh/bvh_build.h"
+
+#include "bvh/bvh_binning.h"
#include "bvh/bvh_node.h"
#include "bvh/bvh_params.h"
#include "bvh_split.h"
diff --git a/intern/cycles/bvh/bvh_build.h b/intern/cycles/bvh/bvh_build.h
index 5733708050d..7b245139819 100644
--- a/intern/cycles/bvh/bvh_build.h
+++ b/intern/cycles/bvh/bvh_build.h
@@ -20,17 +20,17 @@
#include <float.h>
-#include "bvh/bvh.h"
-#include "bvh/bvh_binning.h"
+#include "bvh/bvh_params.h"
#include "bvh/bvh_unaligned.h"
-#include "util/util_boundbox.h"
#include "util/util_task.h"
#include "util/util_vector.h"
CCL_NAMESPACE_BEGIN
+class Boundbox;
class BVHBuildTask;
+class BVHNode;
class BVHSpatialSplitBuildTask;
class BVHParams;
class InnerNode;
diff --git a/intern/cycles/bvh/bvh_node.cpp b/intern/cycles/bvh/bvh_node.cpp
index 4f788c66797..4237c62ab5b 100644
--- a/intern/cycles/bvh/bvh_node.cpp
+++ b/intern/cycles/bvh/bvh_node.cpp
@@ -15,9 +15,10 @@
* limitations under the License.
*/
+#include "bvh/bvh_node.h"
+
#include "bvh/bvh.h"
#include "bvh/bvh_build.h"
-#include "bvh/bvh_node.h"
#include "util/util_debug.h"
#include "util/util_vector.h"
diff --git a/intern/cycles/bvh/bvh_node.h b/intern/cycles/bvh/bvh_node.h
index 60511b4b012..1c875f5a524 100644
--- a/intern/cycles/bvh/bvh_node.h
+++ b/intern/cycles/bvh/bvh_node.h
@@ -19,7 +19,6 @@
#define __BVH_NODE_H__
#include "util/util_boundbox.h"
-#include "util/util_debug.h"
#include "util/util_types.h"
CCL_NAMESPACE_BEGIN
diff --git a/intern/cycles/bvh/bvh_params.h b/intern/cycles/bvh/bvh_params.h
index 9795a7a4350..7dd699b33a4 100644
--- a/intern/cycles/bvh/bvh_params.h
+++ b/intern/cycles/bvh/bvh_params.h
@@ -246,4 +246,3 @@ struct BVHSpatialStorage {
CCL_NAMESPACE_END
#endif /* __BVH_PARAMS_H__ */
-
diff --git a/intern/cycles/bvh/bvh_sort.cpp b/intern/cycles/bvh/bvh_sort.cpp
index d29629c0279..3a01061b285 100644
--- a/intern/cycles/bvh/bvh_sort.cpp
+++ b/intern/cycles/bvh/bvh_sort.cpp
@@ -15,9 +15,10 @@
* limitations under the License.
*/
-#include "bvh/bvh_build.h"
#include "bvh/bvh_sort.h"
+#include "bvh/bvh_build.h"
+
#include "util/util_algorithm.h"
#include "util/util_debug.h"
#include "util/util_task.h"
diff --git a/intern/cycles/bvh/bvh_sort.h b/intern/cycles/bvh/bvh_sort.h
index b49ca02eb60..936401d8607 100644
--- a/intern/cycles/bvh/bvh_sort.h
+++ b/intern/cycles/bvh/bvh_sort.h
@@ -18,8 +18,11 @@
#ifndef __BVH_SORT_H__
#define __BVH_SORT_H__
+#include <cstddef>
+
CCL_NAMESPACE_BEGIN
+class BVHReference;
class BVHUnaligned;
struct Transform;
@@ -33,4 +36,3 @@ void bvh_reference_sort(int start,
CCL_NAMESPACE_END
#endif /* __BVH_SORT_H__ */
-
diff --git a/intern/cycles/bvh/bvh_split.cpp b/intern/cycles/bvh/bvh_split.cpp
index b10d69a495d..c55ba40b565 100644
--- a/intern/cycles/bvh/bvh_split.cpp
+++ b/intern/cycles/bvh/bvh_split.cpp
@@ -15,8 +15,9 @@
* limitations under the License.
*/
-#include "bvh/bvh_build.h"
#include "bvh/bvh_split.h"
+
+#include "bvh/bvh_build.h"
#include "bvh/bvh_sort.h"
#include "render/mesh.h"
diff --git a/intern/cycles/bvh/bvh_unaligned.cpp b/intern/cycles/bvh/bvh_unaligned.cpp
index ef227d20ea9..b522a8f3e10 100644
--- a/intern/cycles/bvh/bvh_unaligned.cpp
+++ b/intern/cycles/bvh/bvh_unaligned.cpp
@@ -14,7 +14,6 @@
* limitations under the License.
*/
-
#include "bvh/bvh_unaligned.h"
#include "render/mesh.h"
diff --git a/intern/cycles/bvh/bvh_unaligned.h b/intern/cycles/bvh/bvh_unaligned.h
index f41bae79e2b..c3ece051cd5 100644
--- a/intern/cycles/bvh/bvh_unaligned.h
+++ b/intern/cycles/bvh/bvh_unaligned.h
@@ -78,4 +78,3 @@ protected:
CCL_NAMESPACE_END
#endif /* __BVH_UNALIGNED_H__ */
-
diff --git a/intern/cycles/cmake/external_libs.cmake b/intern/cycles/cmake/external_libs.cmake
index 403a0540963..df88b91f5ac 100644
--- a/intern/cycles/cmake/external_libs.cmake
+++ b/intern/cycles/cmake/external_libs.cmake
@@ -135,13 +135,5 @@ if(CYCLES_STANDALONE_REPOSITORY)
unset(_lib_DIR)
else()
- if(WIN32)
- set(GLOG_INCLUDE_DIRS ${CMAKE_SOURCE_DIR}/extern/glog/src/windows)
- set(GFLAGS_INCLUDE_DIRS ${CMAKE_SOURCE_DIR}/extern/gflags/src)
- else()
- set(GLOG_INCLUDE_DIRS ${CMAKE_SOURCE_DIR}/extern/glog/src)
- set(GFLAGS_INCLUDE_DIRS ${CMAKE_SOURCE_DIR}/extern/gflags/src)
- endif()
- set(GFLAGS_NAMESPACE "gflags")
set(LLVM_LIBRARIES ${LLVM_LIBRARY})
endif()
diff --git a/intern/cycles/device/CMakeLists.txt b/intern/cycles/device/CMakeLists.txt
index 6ef2aa1caad..74ec57ddf74 100644
--- a/intern/cycles/device/CMakeLists.txt
+++ b/intern/cycles/device/CMakeLists.txt
@@ -25,6 +25,7 @@ set(SRC
device.cpp
device_cpu.cpp
device_cuda.cpp
+ device_denoising.cpp
device_multi.cpp
device_opencl.cpp
device_split_kernel.cpp
@@ -48,6 +49,7 @@ endif()
set(SRC_HEADERS
device.h
+ device_denoising.h
device_memory.h
device_intern.h
device_network.h
diff --git a/intern/cycles/device/device.cpp b/intern/cycles/device/device.cpp
index 968af447e29..a54bb77f9f3 100644
--- a/intern/cycles/device/device.cpp
+++ b/intern/cycles/device/device.cpp
@@ -66,6 +66,10 @@ std::ostream& operator <<(std::ostream &os,
<< string_from_bool(requested_features.use_patch_evaluation) << std::endl;
os << "Use Transparent Shadows: "
<< string_from_bool(requested_features.use_transparent) << std::endl;
+ os << "Use Principled BSDF: "
+ << string_from_bool(requested_features.use_principled) << std::endl;
+ os << "Use Denoising: "
+ << string_from_bool(requested_features.use_denoising) << std::endl;
return os;
}
@@ -400,4 +404,16 @@ void Device::free_memory()
devices.free_memory();
}
+
+device_sub_ptr::device_sub_ptr(Device *device, device_memory& mem, int offset, int size, MemoryType type)
+ : device(device)
+{
+ ptr = device->mem_alloc_sub_ptr(mem, offset, size, type);
+}
+
+device_sub_ptr::~device_sub_ptr()
+{
+ device->mem_free_sub_ptr(ptr);
+}
+
CCL_NAMESPACE_END
diff --git a/intern/cycles/device/device.h b/intern/cycles/device/device.h
index ac06e561795..b3b693c630c 100644
--- a/intern/cycles/device/device.h
+++ b/intern/cycles/device/device.h
@@ -124,6 +124,12 @@ public:
/* Use various shadow tricks, such as shadow catcher. */
bool use_shadow_tricks;
+ /* Per-uber shader usage flags. */
+ bool use_principled;
+
+ /* Denoising features. */
+ bool use_denoising;
+
DeviceRequestedFeatures()
{
/* TODO(sergey): Find more meaningful defaults. */
@@ -141,6 +147,8 @@ public:
use_patch_evaluation = false;
use_transparent = false;
use_shadow_tricks = false;
+ use_principled = false;
+ use_denoising = false;
}
bool modified(const DeviceRequestedFeatures& requested_features)
@@ -158,7 +166,9 @@ public:
use_integrator_branched == requested_features.use_integrator_branched &&
use_patch_evaluation == requested_features.use_patch_evaluation &&
use_transparent == requested_features.use_transparent &&
- use_shadow_tricks == requested_features.use_shadow_tricks);
+ use_shadow_tricks == requested_features.use_shadow_tricks &&
+ use_principled == requested_features.use_principled &&
+ use_denoising == requested_features.use_denoising);
}
/* Convert the requested features structure to a build options,
@@ -205,6 +215,12 @@ public:
if(!use_shadow_tricks) {
build_options += " -D__NO_SHADOW_TRICKS__";
}
+ if(!use_principled) {
+ build_options += " -D__NO_PRINCIPLED__";
+ }
+ if(!use_denoising) {
+ build_options += " -D__NO_DENOISING__";
+ }
return build_options;
}
};
@@ -220,6 +236,7 @@ struct DeviceDrawParams {
};
class Device {
+ friend class device_sub_ptr;
protected:
Device(DeviceInfo& info_, Stats &stats_, bool background) : background(background), vertex_buffer(0), info(info_), stats(stats_) {}
@@ -229,6 +246,14 @@ protected:
/* used for real time display */
unsigned int vertex_buffer;
+ virtual device_ptr mem_alloc_sub_ptr(device_memory& /*mem*/, int /*offset*/, int /*size*/, MemoryType /*type*/)
+ {
+ /* Only required for devices that implement denoising. */
+ assert(false);
+ return (device_ptr) 0;
+ }
+ virtual void mem_free_sub_ptr(device_ptr /*ptr*/) {};
+
public:
virtual ~Device();
@@ -257,6 +282,8 @@ public:
virtual void mem_zero(device_memory& mem) = 0;
virtual void mem_free(device_memory& mem) = 0;
+ virtual int mem_address_alignment() { return 16; }
+
/* constant memory */
virtual void const_copy_to(const char *name, void *host, size_t size) = 0;
@@ -304,6 +331,8 @@ public:
/* multi device */
virtual void map_tile(Device * /*sub_device*/, RenderTile& /*tile*/) {}
virtual int device_number(Device * /*sub_device*/) { return 0; }
+ virtual void map_neighbor_tiles(Device * /*sub_device*/, RenderTile * /*tiles*/) {}
+ virtual void unmap_neighbor_tiles(Device * /*sub_device*/, RenderTile * /*tiles*/) {}
/* static */
static Device *create(DeviceInfo& info, Stats &stats, bool background = true);
diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp
index 3c481bb2b39..18112437b45 100644
--- a/intern/cycles/device/device_cpu.cpp
+++ b/intern/cycles/device/device_cpu.cpp
@@ -25,6 +25,7 @@
#endif
#include "device/device.h"
+#include "device/device_denoising.h"
#include "device/device_intern.h"
#include "device/device_split_kernel.h"
@@ -34,6 +35,8 @@
#include "kernel/split/kernel_split_data.h"
#include "kernel/kernel_globals.h"
+#include "kernel/filter/filter.h"
+
#include "kernel/osl/osl_shader.h"
#include "kernel/osl/osl_globals.h"
@@ -53,91 +56,108 @@ CCL_NAMESPACE_BEGIN
class CPUDevice;
-class CPUSplitKernel : public DeviceSplitKernel {
- CPUDevice *device;
-public:
- explicit CPUSplitKernel(CPUDevice *device);
-
- virtual bool enqueue_split_kernel_data_init(const KernelDimensions& dim,
- RenderTile& rtile,
- int num_global_elements,
- device_memory& kernel_globals,
- device_memory& kernel_data_,
- device_memory& split_data,
- device_memory& ray_state,
- device_memory& queue_index,
- device_memory& use_queues_flag,
- device_memory& work_pool_wgs);
+/* Has to be outside of the class to be shared across template instantiations. */
+static const char *logged_architecture = "";
- virtual SplitKernelFunction* get_split_kernel_function(string kernel_name, const DeviceRequestedFeatures&);
- virtual int2 split_kernel_local_size();
- virtual int2 split_kernel_global_size(device_memory& kg, device_memory& data, DeviceTask *task);
- virtual uint64_t state_buffer_size(device_memory& kg, device_memory& data, size_t num_threads);
-};
-
-class CPUDevice : public Device
-{
- static unordered_map<string, void*> kernel_functions;
-
- static void register_kernel_function(const char* name, void* func)
+template<typename F>
+class KernelFunctions {
+public:
+ KernelFunctions()
{
- kernel_functions[name] = func;
+ kernel = (F)NULL;
}
- static const char* get_arch_name()
+ KernelFunctions(F kernel_default,
+ F kernel_sse2,
+ F kernel_sse3,
+ F kernel_sse41,
+ F kernel_avx,
+ F kernel_avx2)
{
+ const char *architecture_name = "default";
+ kernel = kernel_default;
+
+ /* Silence potential warnings about unused variables
+ * when compiling without some architectures. */
+ (void)kernel_sse2;
+ (void)kernel_sse3;
+ (void)kernel_sse41;
+ (void)kernel_avx;
+ (void)kernel_avx2;
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
if(system_cpu_support_avx2()) {
- return "cpu_avx2";
+ architecture_name = "AVX2";
+ kernel = kernel_avx2;
}
else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
if(system_cpu_support_avx()) {
- return "cpu_avx";
+ architecture_name = "AVX";
+ kernel = kernel_avx;
}
else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
if(system_cpu_support_sse41()) {
- return "cpu_sse41";
+ architecture_name = "SSE4.1";
+ kernel = kernel_sse41;
}
else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
if(system_cpu_support_sse3()) {
- return "cpu_sse3";
+ architecture_name = "SSE3";
+ kernel = kernel_sse3;
}
else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
if(system_cpu_support_sse2()) {
- return "cpu_sse2";
+ architecture_name = "SSE2";
+ kernel = kernel_sse2;
}
- else
#endif
- {
- return "cpu";
+
+ if(strstr(architecture_name, logged_architecture) != 0) {
+ VLOG(1) << "Will be using " << architecture_name << " kernels.";
+ logged_architecture = architecture_name;
}
}
- template<typename F>
- static F get_kernel_function(string name)
- {
- name = string("kernel_") + get_arch_name() + "_" + name;
-
- unordered_map<string, void*>::iterator it = kernel_functions.find(name);
+ inline F operator()() const {
+ assert(kernel);
+ return kernel;
+ }
+protected:
+ F kernel;
+};
- if(it == kernel_functions.end()) {
- assert(!"kernel function not found");
- return NULL;
- }
+class CPUSplitKernel : public DeviceSplitKernel {
+ CPUDevice *device;
+public:
+ explicit CPUSplitKernel(CPUDevice *device);
- return (F)it->second;
- }
+ virtual bool enqueue_split_kernel_data_init(const KernelDimensions& dim,
+ RenderTile& rtile,
+ int num_global_elements,
+ device_memory& kernel_globals,
+ device_memory& kernel_data_,
+ device_memory& split_data,
+ device_memory& ray_state,
+ device_memory& queue_index,
+ device_memory& use_queues_flag,
+ device_memory& work_pool_wgs);
- friend class CPUSplitKernel;
+ virtual SplitKernelFunction* get_split_kernel_function(const string& kernel_name,
+ const DeviceRequestedFeatures&);
+ virtual int2 split_kernel_local_size();
+ virtual int2 split_kernel_global_size(device_memory& kg, device_memory& data, DeviceTask *task);
+ virtual uint64_t state_buffer_size(device_memory& kg, device_memory& data, size_t num_threads);
+};
+class CPUDevice : public Device
+{
public:
TaskPool task_pool;
KernelGlobals kernel_globals;
@@ -149,77 +169,92 @@ public:
bool use_split_kernel;
DeviceRequestedFeatures requested_features;
-
+
+ KernelFunctions<void(*)(KernelGlobals *, float *, unsigned int *, int, int, int, int, int)> path_trace_kernel;
+ KernelFunctions<void(*)(KernelGlobals *, uchar4 *, float *, float, int, int, int, int)> convert_to_half_float_kernel;
+ KernelFunctions<void(*)(KernelGlobals *, uchar4 *, float *, float, int, int, int, int)> convert_to_byte_kernel;
+ KernelFunctions<void(*)(KernelGlobals *, uint4 *, float4 *, float*, int, int, int, int, int)> shader_kernel;
+
+ KernelFunctions<void(*)(int, TilesInfo*, int, int, float*, float*, float*, float*, float*, int*, int, int, bool)> filter_divide_shadow_kernel;
+ KernelFunctions<void(*)(int, TilesInfo*, int, int, int, int, float*, float*, int*, int, int, bool)> filter_get_feature_kernel;
+ KernelFunctions<void(*)(int, int, float*, float*, float*, float*, int*, int)> filter_detect_outliers_kernel;
+ KernelFunctions<void(*)(int, int, float*, float*, float*, float*, int*, int)> filter_combine_halves_kernel;
+
+ KernelFunctions<void(*)(int, int, float*, float*, float*, int*, int, int, float, float)> filter_nlm_calc_difference_kernel;
+ KernelFunctions<void(*)(float*, float*, int*, int, int)> filter_nlm_blur_kernel;
+ KernelFunctions<void(*)(float*, float*, int*, int, int)> filter_nlm_calc_weight_kernel;
+ KernelFunctions<void(*)(int, int, float*, float*, float*, float*, int*, int, int)> filter_nlm_update_output_kernel;
+ KernelFunctions<void(*)(float*, float*, int*, int)> filter_nlm_normalize_kernel;
+
+ KernelFunctions<void(*)(float*, int, int, int, float*, int*, int*, int, int, float)> filter_construct_transform_kernel;
+ KernelFunctions<void(*)(int, int, float*, float*, float*, int*, float*, float3*, int*, int*, int, int, int, int)> filter_nlm_construct_gramian_kernel;
+ KernelFunctions<void(*)(int, int, int, int, int, float*, int*, float*, float3*, int*, int)> filter_finalize_kernel;
+
+ KernelFunctions<void(*)(KernelGlobals *, ccl_constant KernelData*, ccl_global void*, int, ccl_global char*,
+ ccl_global uint*, int, int, int, int, int, int, int, int, ccl_global int*, int,
+ ccl_global char*, ccl_global unsigned int*, unsigned int, ccl_global float*)> data_init_kernel;
+ unordered_map<string, KernelFunctions<void(*)(KernelGlobals*, KernelData*)> > split_kernels;
+
+#define KERNEL_FUNCTIONS(name) \
+ KERNEL_NAME_EVAL(cpu, name), \
+ KERNEL_NAME_EVAL(cpu_sse2, name), \
+ KERNEL_NAME_EVAL(cpu_sse3, name), \
+ KERNEL_NAME_EVAL(cpu_sse41, name), \
+ KERNEL_NAME_EVAL(cpu_avx, name), \
+ KERNEL_NAME_EVAL(cpu_avx2, name)
+
CPUDevice(DeviceInfo& info, Stats &stats, bool background)
- : Device(info, stats, background)
+ : Device(info, stats, background),
+#define REGISTER_KERNEL(name) name ## _kernel(KERNEL_FUNCTIONS(name))
+ REGISTER_KERNEL(path_trace),
+ REGISTER_KERNEL(convert_to_half_float),
+ REGISTER_KERNEL(convert_to_byte),
+ REGISTER_KERNEL(shader),
+ REGISTER_KERNEL(filter_divide_shadow),
+ REGISTER_KERNEL(filter_get_feature),
+ REGISTER_KERNEL(filter_detect_outliers),
+ REGISTER_KERNEL(filter_combine_halves),
+ REGISTER_KERNEL(filter_nlm_calc_difference),
+ REGISTER_KERNEL(filter_nlm_blur),
+ REGISTER_KERNEL(filter_nlm_calc_weight),
+ REGISTER_KERNEL(filter_nlm_update_output),
+ REGISTER_KERNEL(filter_nlm_normalize),
+ REGISTER_KERNEL(filter_construct_transform),
+ REGISTER_KERNEL(filter_nlm_construct_gramian),
+ REGISTER_KERNEL(filter_finalize),
+ REGISTER_KERNEL(data_init)
+#undef REGISTER_KERNEL
{
#ifdef WITH_OSL
kernel_globals.osl = &osl_globals;
#endif
-
- /* do now to avoid thread issues */
- system_cpu_support_sse2();
- system_cpu_support_sse3();
- system_cpu_support_sse41();
- system_cpu_support_avx();
- system_cpu_support_avx2();
-
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
- if(system_cpu_support_avx2()) {
- VLOG(1) << "Will be using AVX2 kernels.";
- }
- else
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
- if(system_cpu_support_avx()) {
- VLOG(1) << "Will be using AVX kernels.";
- }
- else
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
- if(system_cpu_support_sse41()) {
- VLOG(1) << "Will be using SSE4.1 kernels.";
- }
- else
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
- if(system_cpu_support_sse3()) {
- VLOG(1) << "Will be using SSE3kernels.";
- }
- else
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
- if(system_cpu_support_sse2()) {
- VLOG(1) << "Will be using SSE2 kernels.";
- }
- else
-#endif
- {
- VLOG(1) << "Will be using regular kernels.";
- }
-
use_split_kernel = DebugFlags().cpu.split_kernel;
if(use_split_kernel) {
VLOG(1) << "Will be using split kernel.";
}
- kernel_cpu_register_functions(register_kernel_function);
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
- kernel_cpu_sse2_register_functions(register_kernel_function);
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
- kernel_cpu_sse3_register_functions(register_kernel_function);
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
- kernel_cpu_sse41_register_functions(register_kernel_function);
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
- kernel_cpu_avx_register_functions(register_kernel_function);
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
- kernel_cpu_avx2_register_functions(register_kernel_function);
-#endif
+#define REGISTER_SPLIT_KERNEL(name) split_kernels[#name] = KernelFunctions<void(*)(KernelGlobals*, KernelData*)>(KERNEL_FUNCTIONS(name))
+ REGISTER_SPLIT_KERNEL(path_init);
+ REGISTER_SPLIT_KERNEL(scene_intersect);
+ REGISTER_SPLIT_KERNEL(lamp_emission);
+ REGISTER_SPLIT_KERNEL(do_volume);
+ REGISTER_SPLIT_KERNEL(queue_enqueue);
+ REGISTER_SPLIT_KERNEL(indirect_background);
+ REGISTER_SPLIT_KERNEL(shader_setup);
+ REGISTER_SPLIT_KERNEL(shader_sort);
+ REGISTER_SPLIT_KERNEL(shader_eval);
+ REGISTER_SPLIT_KERNEL(holdout_emission_blurring_pathtermination_ao);
+ REGISTER_SPLIT_KERNEL(subsurface_scatter);
+ REGISTER_SPLIT_KERNEL(direct_lighting);
+ REGISTER_SPLIT_KERNEL(shadow_blocked_ao);
+ REGISTER_SPLIT_KERNEL(shadow_blocked_dl);
+ REGISTER_SPLIT_KERNEL(enqueue_inactive);
+ REGISTER_SPLIT_KERNEL(next_iteration_setup);
+ REGISTER_SPLIT_KERNEL(indirect_subsurface);
+ REGISTER_SPLIT_KERNEL(buffer_update);
+#undef REGISTER_SPLIT_KERNEL
+#undef KERNEL_FUNCTIONS
}
~CPUDevice()
@@ -273,13 +308,17 @@ public:
if(!mem.data_pointer) {
free((void*)mem.device_pointer);
}
-
mem.device_pointer = 0;
stats.mem_free(mem.device_size);
mem.device_size = 0;
}
}
+ virtual device_ptr mem_alloc_sub_ptr(device_memory& mem, int offset, int /*size*/, MemoryType /*type*/)
+ {
+ return (device_ptr) (((char*) mem.device_pointer) + mem.memory_elements_size(offset));
+ }
+
void const_copy_to(const char *name, void *host, size_t size)
{
kernel_const_copy(&kernel_globals, name, host, size);
@@ -326,13 +365,8 @@ public:
void thread_run(DeviceTask *task)
{
- if(task->type == DeviceTask::PATH_TRACE) {
- if(!use_split_kernel) {
- thread_path_trace(*task);
- }
- else {
- thread_path_trace_split(*task);
- }
+ if(task->type == DeviceTask::RENDER) {
+ thread_render(*task);
}
else if(task->type == DeviceTask::FILM_CONVERT)
thread_film_convert(*task);
@@ -349,117 +383,335 @@ public:
}
};
- void thread_path_trace(DeviceTask& task)
+ bool denoising_set_tiles(device_ptr *buffers, DenoisingTask *task)
{
- if(task_pool.canceled()) {
- if(task.need_finish_queue == false)
- return;
+ mem_alloc("Denoising Tile Info", task->tiles_mem, MEM_READ_ONLY);
+
+ TilesInfo *tiles = (TilesInfo*) task->tiles_mem.data_pointer;
+ for(int i = 0; i < 9; i++) {
+ tiles->buffers[i] = buffers[i];
}
- KernelGlobals kg = thread_kernel_globals_init();
- RenderTile tile;
+ return true;
+ }
- void(*path_trace_kernel)(KernelGlobals*, float*, unsigned int*, int, int, int, int, int);
+ bool denoising_non_local_means(device_ptr image_ptr, device_ptr guide_ptr, device_ptr variance_ptr, device_ptr out_ptr,
+ DenoisingTask *task)
+ {
+ int4 rect = task->rect;
+ int r = task->nlm_state.r;
+ int f = task->nlm_state.f;
+ float a = task->nlm_state.a;
+ float k_2 = task->nlm_state.k_2;
+
+ int w = align_up(rect.z-rect.x, 4);
+ int h = rect.w-rect.y;
+
+ float *blurDifference = (float*) task->nlm_state.temporary_1_ptr;
+ float *difference = (float*) task->nlm_state.temporary_2_ptr;
+ float *weightAccum = (float*) task->nlm_state.temporary_3_ptr;
+
+ memset(weightAccum, 0, sizeof(float)*w*h);
+ memset((float*) out_ptr, 0, sizeof(float)*w*h);
+
+ for(int i = 0; i < (2*r+1)*(2*r+1); i++) {
+ int dy = i / (2*r+1) - r;
+ int dx = i % (2*r+1) - r;
+
+ int local_rect[4] = {max(0, -dx), max(0, -dy), rect.z-rect.x - max(0, dx), rect.w-rect.y - max(0, dy)};
+ filter_nlm_calc_difference_kernel()(dx, dy,
+ (float*) guide_ptr,
+ (float*) variance_ptr,
+ difference,
+ local_rect,
+ w, 0,
+ a, k_2);
+
+ filter_nlm_blur_kernel() (difference, blurDifference, local_rect, w, f);
+ filter_nlm_calc_weight_kernel()(blurDifference, difference, local_rect, w, f);
+ filter_nlm_blur_kernel() (difference, blurDifference, local_rect, w, f);
+
+ filter_nlm_update_output_kernel()(dx, dy,
+ blurDifference,
+ (float*) image_ptr,
+ (float*) out_ptr,
+ weightAccum,
+ local_rect,
+ w, f);
+ }
+
+ int local_rect[4] = {0, 0, rect.z-rect.x, rect.w-rect.y};
+ filter_nlm_normalize_kernel()((float*) out_ptr, weightAccum, local_rect, w);
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
- if(system_cpu_support_avx2()) {
- path_trace_kernel = kernel_cpu_avx2_path_trace;
- }
- else
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
- if(system_cpu_support_avx()) {
- path_trace_kernel = kernel_cpu_avx_path_trace;
- }
- else
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
- if(system_cpu_support_sse41()) {
- path_trace_kernel = kernel_cpu_sse41_path_trace;
- }
- else
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
- if(system_cpu_support_sse3()) {
- path_trace_kernel = kernel_cpu_sse3_path_trace;
- }
- else
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
- if(system_cpu_support_sse2()) {
- path_trace_kernel = kernel_cpu_sse2_path_trace;
+ return true;
+ }
+
+ bool denoising_construct_transform(DenoisingTask *task)
+ {
+ for(int y = 0; y < task->filter_area.w; y++) {
+ for(int x = 0; x < task->filter_area.z; x++) {
+ filter_construct_transform_kernel()((float*) task->buffer.mem.device_pointer,
+ x + task->filter_area.x,
+ y + task->filter_area.y,
+ y*task->filter_area.z + x,
+ (float*) task->storage.transform.device_pointer,
+ (int*) task->storage.rank.device_pointer,
+ &task->rect.x,
+ task->buffer.pass_stride,
+ task->radius,
+ task->pca_threshold);
+ }
}
- else
-#endif
- {
- path_trace_kernel = kernel_cpu_path_trace;
+ return true;
+ }
+
+ bool denoising_reconstruct(device_ptr color_ptr,
+ device_ptr color_variance_ptr,
+ device_ptr output_ptr,
+ DenoisingTask *task)
+ {
+ mem_zero(task->storage.XtWX);
+ mem_zero(task->storage.XtWY);
+
+ float *difference = (float*) task->reconstruction_state.temporary_1_ptr;
+ float *blurDifference = (float*) task->reconstruction_state.temporary_2_ptr;
+
+ int r = task->radius;
+ for(int i = 0; i < (2*r+1)*(2*r+1); i++) {
+ int dy = i / (2*r+1) - r;
+ int dx = i % (2*r+1) - r;
+
+ int local_rect[4] = {max(0, -dx), max(0, -dy),
+ task->reconstruction_state.source_w - max(0, dx),
+ task->reconstruction_state.source_h - max(0, dy)};
+ filter_nlm_calc_difference_kernel()(dx, dy,
+ (float*) color_ptr,
+ (float*) color_variance_ptr,
+ difference,
+ local_rect,
+ task->buffer.w,
+ task->buffer.pass_stride,
+ 1.0f,
+ task->nlm_k_2);
+ filter_nlm_blur_kernel()(difference, blurDifference, local_rect, task->buffer.w, 4);
+ filter_nlm_calc_weight_kernel()(blurDifference, difference, local_rect, task->buffer.w, 4);
+ filter_nlm_blur_kernel()(difference, blurDifference, local_rect, task->buffer.w, 4);
+ filter_nlm_construct_gramian_kernel()(dx, dy,
+ blurDifference,
+ (float*) task->buffer.mem.device_pointer,
+ (float*) task->storage.transform.device_pointer,
+ (int*) task->storage.rank.device_pointer,
+ (float*) task->storage.XtWX.device_pointer,
+ (float3*) task->storage.XtWY.device_pointer,
+ local_rect,
+ &task->reconstruction_state.filter_rect.x,
+ task->buffer.w,
+ task->buffer.h,
+ 4,
+ task->buffer.pass_stride);
+ }
+ for(int y = 0; y < task->filter_area.w; y++) {
+ for(int x = 0; x < task->filter_area.z; x++) {
+ filter_finalize_kernel()(x,
+ y,
+ y*task->filter_area.z + x,
+ task->buffer.w,
+ task->buffer.h,
+ (float*) output_ptr,
+ (int*) task->storage.rank.device_pointer,
+ (float*) task->storage.XtWX.device_pointer,
+ (float3*) task->storage.XtWY.device_pointer,
+ &task->reconstruction_state.buffer_params.x,
+ task->render_buffer.samples);
+ }
}
+ return true;
+ }
- while(task.acquire_tile(this, tile)) {
- float *render_buffer = (float*)tile.buffer;
- uint *rng_state = (uint*)tile.rng_state;
- int start_sample = tile.start_sample;
- int end_sample = tile.start_sample + tile.num_samples;
-
- for(int sample = start_sample; sample < end_sample; sample++) {
- if(task.get_cancel() || task_pool.canceled()) {
- if(task.need_finish_queue == false)
- break;
- }
+ bool denoising_combine_halves(device_ptr a_ptr, device_ptr b_ptr,
+ device_ptr mean_ptr, device_ptr variance_ptr,
+ int r, int4 rect, DenoisingTask * /*task*/)
+ {
+ for(int y = rect.y; y < rect.w; y++) {
+ for(int x = rect.x; x < rect.z; x++) {
+ filter_combine_halves_kernel()(x, y,
+ (float*) mean_ptr,
+ (float*) variance_ptr,
+ (float*) a_ptr,
+ (float*) b_ptr,
+ &rect.x,
+ r);
+ }
+ }
+ return true;
+ }
- for(int y = tile.y; y < tile.y + tile.h; y++) {
- for(int x = tile.x; x < tile.x + tile.w; x++) {
- path_trace_kernel(&kg, render_buffer, rng_state,
- sample, x, y, tile.offset, tile.stride);
- }
- }
+ bool denoising_divide_shadow(device_ptr a_ptr, device_ptr b_ptr,
+ device_ptr sample_variance_ptr, device_ptr sv_variance_ptr,
+ device_ptr buffer_variance_ptr, DenoisingTask *task)
+ {
+ for(int y = task->rect.y; y < task->rect.w; y++) {
+ for(int x = task->rect.x; x < task->rect.z; x++) {
+ filter_divide_shadow_kernel()(task->render_buffer.samples,
+ task->tiles,
+ x, y,
+ (float*) a_ptr,
+ (float*) b_ptr,
+ (float*) sample_variance_ptr,
+ (float*) sv_variance_ptr,
+ (float*) buffer_variance_ptr,
+ &task->rect.x,
+ task->render_buffer.pass_stride,
+ task->render_buffer.denoising_data_offset,
+ use_split_kernel);
+ }
+ }
+ return true;
+ }
- tile.sample = sample + 1;
+ bool denoising_get_feature(int mean_offset,
+ int variance_offset,
+ device_ptr mean_ptr,
+ device_ptr variance_ptr,
+ DenoisingTask *task)
+ {
+ for(int y = task->rect.y; y < task->rect.w; y++) {
+ for(int x = task->rect.x; x < task->rect.z; x++) {
+ filter_get_feature_kernel()(task->render_buffer.samples,
+ task->tiles,
+ mean_offset,
+ variance_offset,
+ x, y,
+ (float*) mean_ptr,
+ (float*) variance_ptr,
+ &task->rect.x,
+ task->render_buffer.pass_stride,
+ task->render_buffer.denoising_data_offset,
+ use_split_kernel);
+ }
+ }
+ return true;
+ }
- task.update_progress(&tile, tile.w*tile.h);
+ bool denoising_detect_outliers(device_ptr image_ptr,
+ device_ptr variance_ptr,
+ device_ptr depth_ptr,
+ device_ptr output_ptr,
+ DenoisingTask *task)
+ {
+ for(int y = task->rect.y; y < task->rect.w; y++) {
+ for(int x = task->rect.x; x < task->rect.z; x++) {
+ filter_detect_outliers_kernel()(x, y,
+ (float*) image_ptr,
+ (float*) variance_ptr,
+ (float*) depth_ptr,
+ (float*) output_ptr,
+ &task->rect.x,
+ task->buffer.pass_stride);
}
+ }
+ return true;
+ }
- task.release_tile(tile);
+ void path_trace(DeviceTask &task, RenderTile &tile, KernelGlobals *kg)
+ {
+ float *render_buffer = (float*)tile.buffer;
+ uint *rng_state = (uint*)tile.rng_state;
+ int start_sample = tile.start_sample;
+ int end_sample = tile.start_sample + tile.num_samples;
- if(task_pool.canceled()) {
+ for(int sample = start_sample; sample < end_sample; sample++) {
+ if(task.get_cancel() || task_pool.canceled()) {
if(task.need_finish_queue == false)
break;
}
+
+ for(int y = tile.y; y < tile.y + tile.h; y++) {
+ for(int x = tile.x; x < tile.x + tile.w; x++) {
+ path_trace_kernel()(kg, render_buffer, rng_state,
+ sample, x, y, tile.offset, tile.stride);
+ }
+ }
+
+ tile.sample = sample + 1;
+
+ task.update_progress(&tile, tile.w*tile.h);
}
+ }
+
+ void denoise(DeviceTask &task, RenderTile &tile)
+ {
+ tile.sample = tile.start_sample + tile.num_samples;
+
+ DenoisingTask denoising(this);
- thread_kernel_globals_free(&kg);
+ denoising.functions.construct_transform = function_bind(&CPUDevice::denoising_construct_transform, this, &denoising);
+ denoising.functions.reconstruct = function_bind(&CPUDevice::denoising_reconstruct, this, _1, _2, _3, &denoising);
+ denoising.functions.divide_shadow = function_bind(&CPUDevice::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising);
+ denoising.functions.non_local_means = function_bind(&CPUDevice::denoising_non_local_means, this, _1, _2, _3, _4, &denoising);
+ denoising.functions.combine_halves = function_bind(&CPUDevice::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising);
+ denoising.functions.get_feature = function_bind(&CPUDevice::denoising_get_feature, this, _1, _2, _3, _4, &denoising);
+ denoising.functions.detect_outliers = function_bind(&CPUDevice::denoising_detect_outliers, this, _1, _2, _3, _4, &denoising);
+ denoising.functions.set_tiles = function_bind(&CPUDevice::denoising_set_tiles, this, _1, &denoising);
+
+ denoising.filter_area = make_int4(tile.x, tile.y, tile.w, tile.h);
+ denoising.render_buffer.samples = tile.sample;
+
+ RenderTile rtiles[9];
+ rtiles[4] = tile;
+ task.map_neighbor_tiles(rtiles, this);
+ denoising.tiles_from_rendertiles(rtiles);
+
+ denoising.init_from_devicetask(task);
+
+ denoising.run_denoising();
+
+ task.unmap_neighbor_tiles(rtiles, this);
+
+ task.update_progress(&tile, tile.w*tile.h);
}
- void thread_path_trace_split(DeviceTask& task)
+ void thread_render(DeviceTask& task)
{
if(task_pool.canceled()) {
if(task.need_finish_queue == false)
return;
}
- RenderTile tile;
-
- CPUSplitKernel split_kernel(this);
-
/* allocate buffer for kernel globals */
- device_memory kgbuffer;
- kgbuffer.resize(sizeof(KernelGlobals));
+ device_only_memory<KernelGlobals> kgbuffer;
+ kgbuffer.resize(1);
mem_alloc("kernel_globals", kgbuffer, MEM_READ_WRITE);
- KernelGlobals *kg = (KernelGlobals*)kgbuffer.device_pointer;
- *kg = thread_kernel_globals_init();
+ KernelGlobals *kg = new ((void*) kgbuffer.device_pointer) KernelGlobals(thread_kernel_globals_init());
- requested_features.max_closure = MAX_CLOSURE;
- if(!split_kernel.load_kernels(requested_features)) {
- thread_kernel_globals_free((KernelGlobals*)kgbuffer.device_pointer);
- mem_free(kgbuffer);
+ CPUSplitKernel *split_kernel = NULL;
+ if(use_split_kernel) {
+ split_kernel = new CPUSplitKernel(this);
+ requested_features.max_closure = MAX_CLOSURE;
+ if(!split_kernel->load_kernels(requested_features)) {
+ thread_kernel_globals_free((KernelGlobals*)kgbuffer.device_pointer);
+ mem_free(kgbuffer);
- return;
+ delete split_kernel;
+ return;
+ }
}
+ RenderTile tile;
while(task.acquire_tile(this, tile)) {
- device_memory data;
- split_kernel.path_trace(&task, tile, kgbuffer, data);
+ if(tile.task == RenderTile::PATH_TRACE) {
+ if(use_split_kernel) {
+ device_memory data;
+ split_kernel->path_trace(&task, tile, kgbuffer, data);
+ }
+ else {
+ path_trace(task, tile, kg);
+ }
+ }
+ else if(tile.task == RenderTile::DENOISE) {
+ denoise(task, tile);
+ }
task.release_tile(tile);
@@ -470,7 +722,9 @@ public:
}
thread_kernel_globals_free((KernelGlobals*)kgbuffer.device_pointer);
+ kg->~KernelGlobals();
mem_free(kgbuffer);
+ delete split_kernel;
}
void thread_film_convert(DeviceTask& task)
@@ -478,86 +732,16 @@ public:
float sample_scale = 1.0f/(task.sample + 1);
if(task.rgba_half) {
- void(*convert_to_half_float_kernel)(KernelGlobals *, uchar4 *, float *, float, int, int, int, int);
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
- if(system_cpu_support_avx2()) {
- convert_to_half_float_kernel = kernel_cpu_avx2_convert_to_half_float;
- }
- else
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
- if(system_cpu_support_avx()) {
- convert_to_half_float_kernel = kernel_cpu_avx_convert_to_half_float;
- }
- else
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
- if(system_cpu_support_sse41()) {
- convert_to_half_float_kernel = kernel_cpu_sse41_convert_to_half_float;
- }
- else
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
- if(system_cpu_support_sse3()) {
- convert_to_half_float_kernel = kernel_cpu_sse3_convert_to_half_float;
- }
- else
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
- if(system_cpu_support_sse2()) {
- convert_to_half_float_kernel = kernel_cpu_sse2_convert_to_half_float;
- }
- else
-#endif
- {
- convert_to_half_float_kernel = kernel_cpu_convert_to_half_float;
- }
-
for(int y = task.y; y < task.y + task.h; y++)
for(int x = task.x; x < task.x + task.w; x++)
- convert_to_half_float_kernel(&kernel_globals, (uchar4*)task.rgba_half, (float*)task.buffer,
- sample_scale, x, y, task.offset, task.stride);
+ convert_to_half_float_kernel()(&kernel_globals, (uchar4*)task.rgba_half, (float*)task.buffer,
+ sample_scale, x, y, task.offset, task.stride);
}
else {
- void(*convert_to_byte_kernel)(KernelGlobals *, uchar4 *, float *, float, int, int, int, int);
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
- if(system_cpu_support_avx2()) {
- convert_to_byte_kernel = kernel_cpu_avx2_convert_to_byte;
- }
- else
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
- if(system_cpu_support_avx()) {
- convert_to_byte_kernel = kernel_cpu_avx_convert_to_byte;
- }
- else
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
- if(system_cpu_support_sse41()) {
- convert_to_byte_kernel = kernel_cpu_sse41_convert_to_byte;
- }
- else
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
- if(system_cpu_support_sse3()) {
- convert_to_byte_kernel = kernel_cpu_sse3_convert_to_byte;
- }
- else
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
- if(system_cpu_support_sse2()) {
- convert_to_byte_kernel = kernel_cpu_sse2_convert_to_byte;
- }
- else
-#endif
- {
- convert_to_byte_kernel = kernel_cpu_convert_to_byte;
- }
-
for(int y = task.y; y < task.y + task.h; y++)
for(int x = task.x; x < task.x + task.w; x++)
- convert_to_byte_kernel(&kernel_globals, (uchar4*)task.rgba_byte, (float*)task.buffer,
- sample_scale, x, y, task.offset, task.stride);
+ convert_to_byte_kernel()(&kernel_globals, (uchar4*)task.rgba_byte, (float*)task.buffer,
+ sample_scale, x, y, task.offset, task.stride);
}
}
@@ -569,53 +753,17 @@ public:
#ifdef WITH_OSL
OSLShader::thread_init(&kg, &kernel_globals, &osl_globals);
#endif
- void(*shader_kernel)(KernelGlobals*, uint4*, float4*, float*, int, int, int, int, int);
-
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
- if(system_cpu_support_avx2()) {
- shader_kernel = kernel_cpu_avx2_shader;
- }
- else
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
- if(system_cpu_support_avx()) {
- shader_kernel = kernel_cpu_avx_shader;
- }
- else
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
- if(system_cpu_support_sse41()) {
- shader_kernel = kernel_cpu_sse41_shader;
- }
- else
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
- if(system_cpu_support_sse3()) {
- shader_kernel = kernel_cpu_sse3_shader;
- }
- else
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
- if(system_cpu_support_sse2()) {
- shader_kernel = kernel_cpu_sse2_shader;
- }
- else
-#endif
- {
- shader_kernel = kernel_cpu_shader;
- }
-
for(int sample = 0; sample < task.num_samples; sample++) {
for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++)
- shader_kernel(&kg,
- (uint4*)task.shader_input,
- (float4*)task.shader_output,
- (float*)task.shader_output_luma,
- task.shader_eval_type,
- task.shader_filter,
- x,
- task.offset,
- sample);
+ shader_kernel()(&kg,
+ (uint4*)task.shader_input,
+ (float4*)task.shader_output,
+ (float*)task.shader_output_luma,
+ task.shader_eval_type,
+ task.shader_filter,
+ x,
+ task.offset,
+ sample);
if(task.get_cancel() || task_pool.canceled())
break;
@@ -752,58 +900,6 @@ bool CPUSplitKernel::enqueue_split_kernel_data_init(const KernelDimensions& dim,
device_memory& use_queues_flags,
device_memory& work_pool_wgs)
{
- typedef void(*data_init_t)(KernelGlobals *kg,
- ccl_constant KernelData *data,
- ccl_global void *split_data_buffer,
- int num_elements,
- ccl_global char *ray_state,
- ccl_global uint *rng_state,
- int start_sample,
- int end_sample,
- int sx, int sy, int sw, int sh, int offset, int stride,
- ccl_global int *Queue_index,
- int queuesize,
- ccl_global char *use_queues_flag,
- ccl_global unsigned int *work_pool_wgs,
- unsigned int num_samples,
- ccl_global float *buffer);
-
- data_init_t data_init;
-
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
- if(system_cpu_support_avx2()) {
- data_init = kernel_cpu_avx2_data_init;
- }
- else
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
- if(system_cpu_support_avx()) {
- data_init = kernel_cpu_avx_data_init;
- }
- else
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
- if(system_cpu_support_sse41()) {
- data_init = kernel_cpu_sse41_data_init;
- }
- else
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
- if(system_cpu_support_sse3()) {
- data_init = kernel_cpu_sse3_data_init;
- }
- else
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
- if(system_cpu_support_sse2()) {
- data_init = kernel_cpu_sse2_data_init;
- }
- else
-#endif
- {
- data_init = kernel_cpu_data_init;
- }
-
KernelGlobals *kg = (KernelGlobals*)kernel_globals.device_pointer;
kg->global_size = make_int2(dim.global_size[0], dim.global_size[1]);
@@ -811,37 +907,38 @@ bool CPUSplitKernel::enqueue_split_kernel_data_init(const KernelDimensions& dim,
for(int x = 0; x < dim.global_size[0]; x++) {
kg->global_id = make_int2(x, y);
- data_init((KernelGlobals*)kernel_globals.device_pointer,
- (KernelData*)data.device_pointer,
- (void*)split_data.device_pointer,
- num_global_elements,
- (char*)ray_state.device_pointer,
- (uint*)rtile.rng_state,
- rtile.start_sample,
- rtile.start_sample + rtile.num_samples,
- rtile.x,
- rtile.y,
- rtile.w,
- rtile.h,
- rtile.offset,
- rtile.stride,
- (int*)queue_index.device_pointer,
- dim.global_size[0] * dim.global_size[1],
- (char*)use_queues_flags.device_pointer,
- (uint*)work_pool_wgs.device_pointer,
- rtile.num_samples,
- (float*)rtile.buffer);
+ device->data_init_kernel()((KernelGlobals*)kernel_globals.device_pointer,
+ (KernelData*)data.device_pointer,
+ (void*)split_data.device_pointer,
+ num_global_elements,
+ (char*)ray_state.device_pointer,
+ (uint*)rtile.rng_state,
+ rtile.start_sample,
+ rtile.start_sample + rtile.num_samples,
+ rtile.x,
+ rtile.y,
+ rtile.w,
+ rtile.h,
+ rtile.offset,
+ rtile.stride,
+ (int*)queue_index.device_pointer,
+ dim.global_size[0] * dim.global_size[1],
+ (char*)use_queues_flags.device_pointer,
+ (uint*)work_pool_wgs.device_pointer,
+ rtile.num_samples,
+ (float*)rtile.buffer);
}
}
return true;
}
-SplitKernelFunction* CPUSplitKernel::get_split_kernel_function(string kernel_name, const DeviceRequestedFeatures&)
+SplitKernelFunction* CPUSplitKernel::get_split_kernel_function(const string& kernel_name,
+ const DeviceRequestedFeatures&)
{
CPUSplitKernelFunction *kernel = new CPUSplitKernelFunction(device);
- kernel->func = device->get_kernel_function<void(*)(KernelGlobals*, KernelData*)>(kernel_name);
+ kernel->func = device->split_kernels[kernel_name]();
if(!kernel->func) {
delete kernel;
return NULL;
@@ -865,8 +962,6 @@ uint64_t CPUSplitKernel::state_buffer_size(device_memory& kernel_globals, device
return split_data_buffer_size(kg, num_threads);
}
-unordered_map<string, void*> CPUDevice::kernel_functions;
-
Device *device_cpu_create(DeviceInfo& info, Stats &stats, bool background)
{
return new CPUDevice(info, stats, background);
diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp
index ef283c9d455..3a29538aa13 100644
--- a/intern/cycles/device/device_cuda.cpp
+++ b/intern/cycles/device/device_cuda.cpp
@@ -21,11 +21,14 @@
#include <string.h>
#include "device/device.h"
+#include "device/device_denoising.h"
#include "device/device_intern.h"
#include "device/device_split_kernel.h"
#include "render/buffers.h"
+#include "kernel/filter/filter_defines.h"
+
#ifdef WITH_CUDA_DYNLOAD
# include "cuew.h"
#else
@@ -102,7 +105,8 @@ public:
device_memory& use_queues_flag,
device_memory& work_pool_wgs);
- virtual SplitKernelFunction* get_split_kernel_function(string kernel_name, const DeviceRequestedFeatures&);
+ virtual SplitKernelFunction* get_split_kernel_function(const string& kernel_name,
+ const DeviceRequestedFeatures&);
virtual int2 split_kernel_local_size();
virtual int2 split_kernel_global_size(device_memory& kg, device_memory& data, DeviceTask *task);
};
@@ -113,12 +117,13 @@ public:
DedicatedTaskPool task_pool;
CUdevice cuDevice;
CUcontext cuContext;
- CUmodule cuModule;
+ CUmodule cuModule, cuFilterModule;
map<device_ptr, bool> tex_interp_map;
map<device_ptr, uint> tex_bindless_map;
int cuDevId;
int cuDevArchitecture;
bool first_error;
+ CUDASplitKernel *split_kernel;
struct PixelMem {
GLuint cuPBO;
@@ -169,7 +174,7 @@ public:
CUresult result = stmt; \
\
if(result != CUDA_SUCCESS) { \
- string message = string_printf("CUDA error: %s in %s", cuewErrorString(result), #stmt); \
+ string message = string_printf("CUDA error: %s in %s, line %d", cuewErrorString(result), #stmt, __LINE__); \
if(error_msg == "") \
error_msg = message; \
fprintf(stderr, "%s\n", message.c_str()); \
@@ -221,6 +226,11 @@ public:
cuDevice = 0;
cuContext = 0;
+ cuModule = 0;
+ cuFilterModule = 0;
+
+ split_kernel = NULL;
+
need_bindless_mapping = false;
/* intialize */
@@ -260,6 +270,8 @@ public:
{
task_pool.stop();
+ delete split_kernel;
+
if(info.has_bindless_textures) {
tex_free(bindless_mapping);
}
@@ -296,7 +308,8 @@ public:
* kernel sources md5 and only depends on compiler or compilation settings.
*/
string compile_kernel_get_common_cflags(
- const DeviceRequestedFeatures& requested_features, bool split=false)
+ const DeviceRequestedFeatures& requested_features,
+ bool filter=false, bool split=false)
{
const int cuda_version = cuewCompilerVersion();
const int machine = system_cpu_bits();
@@ -311,7 +324,7 @@ public:
machine,
cuda_version,
include_path.c_str());
- if(use_adaptive_compilation()) {
+ if(!filter && use_adaptive_compilation()) {
cflags += " " + requested_features.get_build_options();
}
const char *extra_cflags = getenv("CYCLES_CUDA_EXTRA_CFLAGS");
@@ -359,8 +372,22 @@ public:
return true;
}
- string compile_kernel(const DeviceRequestedFeatures& requested_features, bool split=false)
+ string compile_kernel(const DeviceRequestedFeatures& requested_features,
+ bool filter=false, bool split=false)
{
+ const char *name, *source;
+ if(filter) {
+ name = "filter";
+ source = "filter.cu";
+ }
+ else if(split) {
+ name = "kernel_split";
+ source = "kernel_split.cu";
+ }
+ else {
+ name = "kernel";
+ source = "kernel.cu";
+ }
/* Compute cubin name. */
int major, minor;
cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId);
@@ -368,9 +395,8 @@ public:
/* Attempt to use kernel provided with Blender. */
if(!use_adaptive_compilation()) {
- const string cubin = path_get(string_printf(split ? "lib/kernel_split_sm_%d%d.cubin"
- : "lib/kernel_sm_%d%d.cubin",
- major, minor));
+ const string cubin = path_get(string_printf("lib/%s_sm_%d%d.cubin",
+ name, major, minor));
VLOG(1) << "Testing for pre-compiled kernel " << cubin << ".";
if(path_exists(cubin)) {
VLOG(1) << "Using precompiled kernel.";
@@ -379,7 +405,7 @@ public:
}
const string common_cflags =
- compile_kernel_get_common_cflags(requested_features, split);
+ compile_kernel_get_common_cflags(requested_features, filter, split);
/* Try to use locally compiled kernel. */
const string source_path = path_get("source");
@@ -390,9 +416,8 @@ public:
*/
const string cubin_md5 = util_md5_string(kernel_md5 + common_cflags);
- const string cubin_file = string_printf(split ? "cycles_kernel_split_sm%d%d_%s.cubin"
- : "cycles_kernel_sm%d%d_%s.cubin",
- major, minor,
+ const string cubin_file = string_printf("cycles_%s_sm%d%d_%s.cubin",
+ name, major, minor,
cubin_md5.c_str());
const string cubin = path_cache_get(path_join("kernels", cubin_file));
VLOG(1) << "Testing for locally compiled kernel " << cubin << ".";
@@ -427,7 +452,7 @@ public:
const string kernel = path_join(
path_join(source_path, "kernel"),
path_join("kernels",
- path_join("cuda", split ? "kernel_split.cu" : "kernel.cu")));
+ path_join("cuda", source)));
double starttime = time_dt();
printf("Compiling CUDA kernel ...\n");
@@ -466,6 +491,16 @@ public:
bool load_kernels(const DeviceRequestedFeatures& requested_features)
{
+ /* TODO(sergey): Support kernels re-load for CUDA devices.
+ *
+ * Currently re-loading kernel will invalidate memory pointers,
+ * causing problems in cuCtxSynchronize.
+ */
+ if(cuFilterModule && cuModule) {
+ VLOG(1) << "Skipping kernel reload, not currently supported.";
+ return true;
+ }
+
/* check if cuda init succeeded */
if(cuContext == 0)
return false;
@@ -475,11 +510,14 @@ public:
return false;
/* get kernel */
- string cubin = compile_kernel(requested_features, use_split_kernel());
-
+ string cubin = compile_kernel(requested_features, false, use_split_kernel());
if(cubin == "")
return false;
+ string filter_cubin = compile_kernel(requested_features, true, false);
+ if(filter_cubin == "")
+ return false;
+
/* open module */
cuda_push_context();
@@ -494,6 +532,14 @@ public:
if(cuda_error_(result, "cuModuleLoad"))
cuda_error_message(string_printf("Failed loading CUDA kernel %s.", cubin.c_str()));
+ if(path_read_text(filter_cubin, cubin_data))
+ result = cuModuleLoadData(&cuFilterModule, cubin_data.c_str());
+ else
+ result = CUDA_ERROR_FILE_NOT_FOUND;
+
+ if(cuda_error_(result, "cuModuleLoad"))
+ cuda_error_message(string_printf("Failed loading CUDA kernel %s.", filter_cubin.c_str()));
+
cuda_pop_context();
return (result == CUDA_SUCCESS);
@@ -576,6 +622,11 @@ public:
}
}
+ virtual device_ptr mem_alloc_sub_ptr(device_memory& mem, int offset, int /*size*/, MemoryType /*type*/)
+ {
+ return (device_ptr) (((char*) mem.device_pointer) + mem.memory_elements_size(offset));
+ }
+
void const_copy_to(const char *name, void *host, size_t size)
{
CUdeviceptr mem;
@@ -876,6 +927,393 @@ public:
}
}
+ bool denoising_set_tiles(device_ptr *buffers, DenoisingTask *task)
+ {
+ mem_alloc("Denoising Tile Info", task->tiles_mem, MEM_READ_ONLY);
+
+ TilesInfo *tiles = (TilesInfo*) task->tiles_mem.data_pointer;
+ for(int i = 0; i < 9; i++) {
+ tiles->buffers[i] = buffers[i];
+ }
+
+ mem_copy_to(task->tiles_mem);
+
+ return !have_error();
+ }
+
+#define CUDA_GET_BLOCKSIZE(func, w, h) \
+ int threads_per_block; \
+ cuda_assert(cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func)); \
+ int threads = (int)sqrt((float)threads_per_block); \
+ int xblocks = ((w) + threads - 1)/threads; \
+ int yblocks = ((h) + threads - 1)/threads;
+
+#define CUDA_LAUNCH_KERNEL(func, args) \
+ cuda_assert(cuLaunchKernel(func, \
+ xblocks, yblocks, 1, \
+ threads, threads, 1, \
+ 0, 0, args, 0));
+
+ bool denoising_non_local_means(device_ptr image_ptr, device_ptr guide_ptr, device_ptr variance_ptr, device_ptr out_ptr,
+ DenoisingTask *task)
+ {
+ if(have_error())
+ return false;
+
+ cuda_push_context();
+
+ int4 rect = task->rect;
+ int w = align_up(rect.z-rect.x, 4);
+ int h = rect.w-rect.y;
+ int r = task->nlm_state.r;
+ int f = task->nlm_state.f;
+ float a = task->nlm_state.a;
+ float k_2 = task->nlm_state.k_2;
+
+ CUdeviceptr difference = task->nlm_state.temporary_1_ptr;
+ CUdeviceptr blurDifference = task->nlm_state.temporary_2_ptr;
+ CUdeviceptr weightAccum = task->nlm_state.temporary_3_ptr;
+
+ cuda_assert(cuMemsetD8(weightAccum, 0, sizeof(float)*w*h));
+ cuda_assert(cuMemsetD8(out_ptr, 0, sizeof(float)*w*h));
+
+ CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMUpdateOutput, cuNLMNormalize;
+ cuda_assert(cuModuleGetFunction(&cuNLMCalcDifference, cuFilterModule, "kernel_cuda_filter_nlm_calc_difference"));
+ cuda_assert(cuModuleGetFunction(&cuNLMBlur, cuFilterModule, "kernel_cuda_filter_nlm_blur"));
+ cuda_assert(cuModuleGetFunction(&cuNLMCalcWeight, cuFilterModule, "kernel_cuda_filter_nlm_calc_weight"));
+ cuda_assert(cuModuleGetFunction(&cuNLMUpdateOutput, cuFilterModule, "kernel_cuda_filter_nlm_update_output"));
+ cuda_assert(cuModuleGetFunction(&cuNLMNormalize, cuFilterModule, "kernel_cuda_filter_nlm_normalize"));
+
+ cuda_assert(cuFuncSetCacheConfig(cuNLMCalcDifference, CU_FUNC_CACHE_PREFER_L1));
+ cuda_assert(cuFuncSetCacheConfig(cuNLMBlur, CU_FUNC_CACHE_PREFER_L1));
+ cuda_assert(cuFuncSetCacheConfig(cuNLMCalcWeight, CU_FUNC_CACHE_PREFER_L1));
+ cuda_assert(cuFuncSetCacheConfig(cuNLMUpdateOutput, CU_FUNC_CACHE_PREFER_L1));
+ cuda_assert(cuFuncSetCacheConfig(cuNLMNormalize, CU_FUNC_CACHE_PREFER_L1));
+
+ CUDA_GET_BLOCKSIZE(cuNLMCalcDifference, rect.z-rect.x, rect.w-rect.y);
+
+ int dx, dy;
+ int4 local_rect;
+ int channel_offset = 0;
+ void *calc_difference_args[] = {&dx, &dy, &guide_ptr, &variance_ptr, &difference, &local_rect, &w, &channel_offset, &a, &k_2};
+ void *blur_args[] = {&difference, &blurDifference, &local_rect, &w, &f};
+ void *calc_weight_args[] = {&blurDifference, &difference, &local_rect, &w, &f};
+ void *update_output_args[] = {&dx, &dy, &blurDifference, &image_ptr, &out_ptr, &weightAccum, &local_rect, &w, &f};
+
+ for(int i = 0; i < (2*r+1)*(2*r+1); i++) {
+ dy = i / (2*r+1) - r;
+ dx = i % (2*r+1) - r;
+ local_rect = make_int4(max(0, -dx), max(0, -dy), rect.z-rect.x - max(0, dx), rect.w-rect.y - max(0, dy));
+
+ CUDA_LAUNCH_KERNEL(cuNLMCalcDifference, calc_difference_args);
+ CUDA_LAUNCH_KERNEL(cuNLMBlur, blur_args);
+ CUDA_LAUNCH_KERNEL(cuNLMCalcWeight, calc_weight_args);
+ CUDA_LAUNCH_KERNEL(cuNLMBlur, blur_args);
+ CUDA_LAUNCH_KERNEL(cuNLMUpdateOutput, update_output_args);
+ }
+
+ local_rect = make_int4(0, 0, rect.z-rect.x, rect.w-rect.y);
+ void *normalize_args[] = {&out_ptr, &weightAccum, &local_rect, &w};
+ CUDA_LAUNCH_KERNEL(cuNLMNormalize, normalize_args);
+ cuda_assert(cuCtxSynchronize());
+
+ cuda_pop_context();
+ return !have_error();
+ }
+
+ bool denoising_construct_transform(DenoisingTask *task)
+ {
+ if(have_error())
+ return false;
+
+ cuda_push_context();
+
+ CUfunction cuFilterConstructTransform;
+ cuda_assert(cuModuleGetFunction(&cuFilterConstructTransform, cuFilterModule, "kernel_cuda_filter_construct_transform"));
+ cuda_assert(cuFuncSetCacheConfig(cuFilterConstructTransform, CU_FUNC_CACHE_PREFER_SHARED));
+ CUDA_GET_BLOCKSIZE(cuFilterConstructTransform,
+ task->storage.w,
+ task->storage.h);
+
+ void *args[] = {&task->buffer.mem.device_pointer,
+ &task->storage.transform.device_pointer,
+ &task->storage.rank.device_pointer,
+ &task->filter_area,
+ &task->rect,
+ &task->radius,
+ &task->pca_threshold,
+ &task->buffer.pass_stride};
+ CUDA_LAUNCH_KERNEL(cuFilterConstructTransform, args);
+ cuda_assert(cuCtxSynchronize());
+
+ cuda_pop_context();
+ return !have_error();
+ }
+
+ bool denoising_reconstruct(device_ptr color_ptr,
+ device_ptr color_variance_ptr,
+ device_ptr output_ptr,
+ DenoisingTask *task)
+ {
+ if(have_error())
+ return false;
+
+ mem_zero(task->storage.XtWX);
+ mem_zero(task->storage.XtWY);
+
+ cuda_push_context();
+
+ CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMConstructGramian, cuFinalize;
+ cuda_assert(cuModuleGetFunction(&cuNLMCalcDifference, cuFilterModule, "kernel_cuda_filter_nlm_calc_difference"));
+ cuda_assert(cuModuleGetFunction(&cuNLMBlur, cuFilterModule, "kernel_cuda_filter_nlm_blur"));
+ cuda_assert(cuModuleGetFunction(&cuNLMCalcWeight, cuFilterModule, "kernel_cuda_filter_nlm_calc_weight"));
+ cuda_assert(cuModuleGetFunction(&cuNLMConstructGramian, cuFilterModule, "kernel_cuda_filter_nlm_construct_gramian"));
+ cuda_assert(cuModuleGetFunction(&cuFinalize, cuFilterModule, "kernel_cuda_filter_finalize"));
+
+ cuda_assert(cuFuncSetCacheConfig(cuNLMCalcDifference, CU_FUNC_CACHE_PREFER_L1));
+ cuda_assert(cuFuncSetCacheConfig(cuNLMBlur, CU_FUNC_CACHE_PREFER_L1));
+ cuda_assert(cuFuncSetCacheConfig(cuNLMCalcWeight, CU_FUNC_CACHE_PREFER_L1));
+ cuda_assert(cuFuncSetCacheConfig(cuNLMConstructGramian, CU_FUNC_CACHE_PREFER_SHARED));
+ cuda_assert(cuFuncSetCacheConfig(cuFinalize, CU_FUNC_CACHE_PREFER_L1));
+
+ CUDA_GET_BLOCKSIZE(cuNLMCalcDifference,
+ task->reconstruction_state.source_w,
+ task->reconstruction_state.source_h);
+
+ CUdeviceptr difference = task->reconstruction_state.temporary_1_ptr;
+ CUdeviceptr blurDifference = task->reconstruction_state.temporary_2_ptr;
+
+ int r = task->radius;
+ int f = 4;
+ float a = 1.0f;
+ for(int i = 0; i < (2*r+1)*(2*r+1); i++) {
+ int dy = i / (2*r+1) - r;
+ int dx = i % (2*r+1) - r;
+
+ int local_rect[4] = {max(0, -dx), max(0, -dy),
+ task->reconstruction_state.source_w - max(0, dx),
+ task->reconstruction_state.source_h - max(0, dy)};
+
+ void *calc_difference_args[] = {&dx, &dy,
+ &color_ptr,
+ &color_variance_ptr,
+ &difference,
+ &local_rect,
+ &task->buffer.w,
+ &task->buffer.pass_stride,
+ &a,
+ &task->nlm_k_2};
+ CUDA_LAUNCH_KERNEL(cuNLMCalcDifference, calc_difference_args);
+
+ void *blur_args[] = {&difference,
+ &blurDifference,
+ &local_rect,
+ &task->buffer.w,
+ &f};
+ CUDA_LAUNCH_KERNEL(cuNLMBlur, blur_args);
+
+ void *calc_weight_args[] = {&blurDifference,
+ &difference,
+ &local_rect,
+ &task->buffer.w,
+ &f};
+ CUDA_LAUNCH_KERNEL(cuNLMCalcWeight, calc_weight_args);
+
+ /* Reuse previous arguments. */
+ CUDA_LAUNCH_KERNEL(cuNLMBlur, blur_args);
+
+ void *construct_gramian_args[] = {&dx, &dy,
+ &blurDifference,
+ &task->buffer.mem.device_pointer,
+ &task->storage.transform.device_pointer,
+ &task->storage.rank.device_pointer,
+ &task->storage.XtWX.device_pointer,
+ &task->storage.XtWY.device_pointer,
+ &local_rect,
+ &task->reconstruction_state.filter_rect,
+ &task->buffer.w,
+ &task->buffer.h,
+ &f,
+ &task->buffer.pass_stride};
+ CUDA_LAUNCH_KERNEL(cuNLMConstructGramian, construct_gramian_args);
+ }
+
+ void *finalize_args[] = {&task->buffer.w,
+ &task->buffer.h,
+ &output_ptr,
+ &task->storage.rank.device_pointer,
+ &task->storage.XtWX.device_pointer,
+ &task->storage.XtWY.device_pointer,
+ &task->filter_area,
+ &task->reconstruction_state.buffer_params.x,
+ &task->render_buffer.samples};
+ CUDA_LAUNCH_KERNEL(cuFinalize, finalize_args);
+ cuda_assert(cuCtxSynchronize());
+
+ cuda_pop_context();
+ return !have_error();
+ }
+
+ bool denoising_combine_halves(device_ptr a_ptr, device_ptr b_ptr,
+ device_ptr mean_ptr, device_ptr variance_ptr,
+ int r, int4 rect, DenoisingTask *task)
+ {
+ if(have_error())
+ return false;
+
+ cuda_push_context();
+
+ CUfunction cuFilterCombineHalves;
+ cuda_assert(cuModuleGetFunction(&cuFilterCombineHalves, cuFilterModule, "kernel_cuda_filter_combine_halves"));
+ cuda_assert(cuFuncSetCacheConfig(cuFilterCombineHalves, CU_FUNC_CACHE_PREFER_L1));
+ CUDA_GET_BLOCKSIZE(cuFilterCombineHalves,
+ task->rect.z-task->rect.x,
+ task->rect.w-task->rect.y);
+
+ void *args[] = {&mean_ptr,
+ &variance_ptr,
+ &a_ptr,
+ &b_ptr,
+ &rect,
+ &r};
+ CUDA_LAUNCH_KERNEL(cuFilterCombineHalves, args);
+ cuda_assert(cuCtxSynchronize());
+
+ cuda_pop_context();
+ return !have_error();
+ }
+
+ bool denoising_divide_shadow(device_ptr a_ptr, device_ptr b_ptr,
+ device_ptr sample_variance_ptr, device_ptr sv_variance_ptr,
+ device_ptr buffer_variance_ptr, DenoisingTask *task)
+ {
+ if(have_error())
+ return false;
+
+ cuda_push_context();
+
+ CUfunction cuFilterDivideShadow;
+ cuda_assert(cuModuleGetFunction(&cuFilterDivideShadow, cuFilterModule, "kernel_cuda_filter_divide_shadow"));
+ cuda_assert(cuFuncSetCacheConfig(cuFilterDivideShadow, CU_FUNC_CACHE_PREFER_L1));
+ CUDA_GET_BLOCKSIZE(cuFilterDivideShadow,
+ task->rect.z-task->rect.x,
+ task->rect.w-task->rect.y);
+
+ bool use_split_variance = use_split_kernel();
+ void *args[] = {&task->render_buffer.samples,
+ &task->tiles_mem.device_pointer,
+ &a_ptr,
+ &b_ptr,
+ &sample_variance_ptr,
+ &sv_variance_ptr,
+ &buffer_variance_ptr,
+ &task->rect,
+ &task->render_buffer.pass_stride,
+ &task->render_buffer.denoising_data_offset,
+ &use_split_variance};
+ CUDA_LAUNCH_KERNEL(cuFilterDivideShadow, args);
+ cuda_assert(cuCtxSynchronize());
+
+ cuda_pop_context();
+ return !have_error();
+ }
+
+ bool denoising_get_feature(int mean_offset,
+ int variance_offset,
+ device_ptr mean_ptr,
+ device_ptr variance_ptr,
+ DenoisingTask *task)
+ {
+ if(have_error())
+ return false;
+
+ cuda_push_context();
+
+ CUfunction cuFilterGetFeature;
+ cuda_assert(cuModuleGetFunction(&cuFilterGetFeature, cuFilterModule, "kernel_cuda_filter_get_feature"));
+ cuda_assert(cuFuncSetCacheConfig(cuFilterGetFeature, CU_FUNC_CACHE_PREFER_L1));
+ CUDA_GET_BLOCKSIZE(cuFilterGetFeature,
+ task->rect.z-task->rect.x,
+ task->rect.w-task->rect.y);
+
+ bool use_split_variance = use_split_kernel();
+ void *args[] = {&task->render_buffer.samples,
+ &task->tiles_mem.device_pointer,
+ &mean_offset,
+ &variance_offset,
+ &mean_ptr,
+ &variance_ptr,
+ &task->rect,
+ &task->render_buffer.pass_stride,
+ &task->render_buffer.denoising_data_offset,
+ &use_split_variance};
+ CUDA_LAUNCH_KERNEL(cuFilterGetFeature, args);
+ cuda_assert(cuCtxSynchronize());
+
+ cuda_pop_context();
+ return !have_error();
+ }
+
+ bool denoising_detect_outliers(device_ptr image_ptr,
+ device_ptr variance_ptr,
+ device_ptr depth_ptr,
+ device_ptr output_ptr,
+ DenoisingTask *task)
+ {
+ if(have_error())
+ return false;
+
+ cuda_push_context();
+
+ CUfunction cuFilterDetectOutliers;
+ cuda_assert(cuModuleGetFunction(&cuFilterDetectOutliers, cuFilterModule, "kernel_cuda_filter_detect_outliers"));
+ cuda_assert(cuFuncSetCacheConfig(cuFilterDetectOutliers, CU_FUNC_CACHE_PREFER_L1));
+ CUDA_GET_BLOCKSIZE(cuFilterDetectOutliers,
+ task->rect.z-task->rect.x,
+ task->rect.w-task->rect.y);
+
+ void *args[] = {&image_ptr,
+ &variance_ptr,
+ &depth_ptr,
+ &output_ptr,
+ &task->rect,
+ &task->buffer.pass_stride};
+
+ CUDA_LAUNCH_KERNEL(cuFilterDetectOutliers, args);
+ cuda_assert(cuCtxSynchronize());
+
+ cuda_pop_context();
+ return !have_error();
+ }
+
+ void denoise(RenderTile &rtile, const DeviceTask &task)
+ {
+ DenoisingTask denoising(this);
+
+ denoising.functions.construct_transform = function_bind(&CUDADevice::denoising_construct_transform, this, &denoising);
+ denoising.functions.reconstruct = function_bind(&CUDADevice::denoising_reconstruct, this, _1, _2, _3, &denoising);
+ denoising.functions.divide_shadow = function_bind(&CUDADevice::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising);
+ denoising.functions.non_local_means = function_bind(&CUDADevice::denoising_non_local_means, this, _1, _2, _3, _4, &denoising);
+ denoising.functions.combine_halves = function_bind(&CUDADevice::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising);
+ denoising.functions.get_feature = function_bind(&CUDADevice::denoising_get_feature, this, _1, _2, _3, _4, &denoising);
+ denoising.functions.detect_outliers = function_bind(&CUDADevice::denoising_detect_outliers, this, _1, _2, _3, _4, &denoising);
+ denoising.functions.set_tiles = function_bind(&CUDADevice::denoising_set_tiles, this, _1, &denoising);
+
+ denoising.filter_area = make_int4(rtile.x, rtile.y, rtile.w, rtile.h);
+ denoising.render_buffer.samples = rtile.sample;
+
+ RenderTile rtiles[9];
+ rtiles[4] = rtile;
+ task.map_neighbor_tiles(rtiles, this);
+ denoising.tiles_from_rendertiles(rtiles);
+
+ denoising.init_from_devicetask(task);
+
+ denoising.run_denoising();
+
+ task.unmap_neighbor_tiles(rtiles, this);
+ }
+
void path_trace(RenderTile& rtile, int sample, bool branched)
{
if(have_error())
@@ -1300,7 +1738,7 @@ public:
void thread_run(DeviceTask *task)
{
- if(task->type == DeviceTask::PATH_TRACE) {
+ if(task->type == DeviceTask::RENDER) {
RenderTile tile;
bool branched = task->integrator_branched;
@@ -1308,47 +1746,56 @@ public:
/* Upload Bindless Mapping */
load_bindless_mapping();
- if(!use_split_kernel()) {
- /* keep rendering tiles until done */
- while(task->acquire_tile(this, tile)) {
- int start_sample = tile.start_sample;
- int end_sample = tile.start_sample + tile.num_samples;
+ DeviceRequestedFeatures requested_features;
+ if(use_split_kernel()) {
+ if(!use_adaptive_compilation()) {
+ requested_features.max_closure = 64;
+ }
+
+ if(split_kernel == NULL) {
+ split_kernel = new CUDASplitKernel(this);
+ split_kernel->load_kernels(requested_features);
+ }
+ }
- for(int sample = start_sample; sample < end_sample; sample++) {
- if(task->get_cancel()) {
- if(task->need_finish_queue == false)
- break;
- }
+ /* keep rendering tiles until done */
+ while(task->acquire_tile(this, tile)) {
+ if(tile.task == RenderTile::PATH_TRACE) {
+ if(use_split_kernel()) {
+ device_memory void_buffer;
+ split_kernel->path_trace(task, tile, void_buffer, void_buffer);
+ }
+ else {
+ int start_sample = tile.start_sample;
+ int end_sample = tile.start_sample + tile.num_samples;
- path_trace(tile, sample, branched);
+ for(int sample = start_sample; sample < end_sample; sample++) {
+ if(task->get_cancel()) {
+ if(task->need_finish_queue == false)
+ break;
+ }
- tile.sample = sample + 1;
+ path_trace(tile, sample, branched);
- task->update_progress(&tile, tile.w*tile.h);
- }
+ tile.sample = sample + 1;
- task->release_tile(tile);
- }
- }
- else {
- DeviceRequestedFeatures requested_features;
- if(!use_adaptive_compilation()) {
- requested_features.max_closure = 64;
+ task->update_progress(&tile, tile.w*tile.h);
+ }
+ }
}
+ else if(tile.task == RenderTile::DENOISE) {
+ tile.sample = tile.start_sample + tile.num_samples;
- CUDASplitKernel split_kernel(this);
- split_kernel.load_kernels(requested_features);
+ denoise(tile, *task);
- while(task->acquire_tile(this, tile)) {
- device_memory void_buffer;
- split_kernel.path_trace(task, tile, void_buffer, void_buffer);
+ task->update_progress(&tile, tile.w*tile.h);
+ }
- task->release_tile(tile);
+ task->release_tile(tile);
- if(task->get_cancel()) {
- if(task->need_finish_queue == false)
- break;
- }
+ if(task->get_cancel()) {
+ if(task->need_finish_queue == false)
+ break;
}
}
}
@@ -1591,7 +2038,8 @@ bool CUDASplitKernel::enqueue_split_kernel_data_init(const KernelDimensions& dim
return !device->have_error();
}
-SplitKernelFunction* CUDASplitKernel::get_split_kernel_function(string kernel_name, const DeviceRequestedFeatures&)
+SplitKernelFunction* CUDASplitKernel::get_split_kernel_function(const string& kernel_name,
+ const DeviceRequestedFeatures&)
{
CUfunction func;
@@ -1627,7 +2075,8 @@ int2 CUDASplitKernel::split_kernel_global_size(device_memory& kg, device_memory&
<< string_human_readable_size(free) << ").";
size_t num_elements = max_elements_for_max_buffer_size(kg, data, free / 2);
- int2 global_size = make_int2(round_down((int)sqrt(num_elements), 32), (int)sqrt(num_elements));
+ size_t side = round_down((int)sqrt(num_elements), 32);
+ int2 global_size = make_int2(side, round_down(num_elements / side, 16));
VLOG(1) << "Global size: " << global_size << ".";
return global_size;
}
diff --git a/intern/cycles/device/device_denoising.cpp b/intern/cycles/device/device_denoising.cpp
new file mode 100644
index 00000000000..619cc1d171e
--- /dev/null
+++ b/intern/cycles/device/device_denoising.cpp
@@ -0,0 +1,232 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "device/device_denoising.h"
+
+#include "kernel/filter/filter_defines.h"
+
+CCL_NAMESPACE_BEGIN
+
+void DenoisingTask::init_from_devicetask(const DeviceTask &task)
+{
+ radius = task.denoising_radius;
+ nlm_k_2 = powf(2.0f, lerp(-5.0f, 3.0f, task.denoising_strength));
+ if(task.denoising_relative_pca) {
+ pca_threshold = -powf(10.0f, lerp(-8.0f, 0.0f, task.denoising_feature_strength));
+ }
+ else {
+ pca_threshold = powf(10.0f, lerp(-5.0f, 3.0f, task.denoising_feature_strength));
+ }
+
+ render_buffer.pass_stride = task.pass_stride;
+ render_buffer.denoising_data_offset = task.pass_denoising_data;
+ render_buffer.denoising_clean_offset = task.pass_denoising_clean;
+
+ /* Expand filter_area by radius pixels and clamp the result to the extent of the neighboring tiles */
+ rect = make_int4(max(tiles->x[0], filter_area.x - radius),
+ max(tiles->y[0], filter_area.y - radius),
+ min(tiles->x[3], filter_area.x + filter_area.z + radius),
+ min(tiles->y[3], filter_area.y + filter_area.w + radius));
+}
+
+void DenoisingTask::tiles_from_rendertiles(RenderTile *rtiles)
+{
+ tiles = (TilesInfo*) tiles_mem.resize(sizeof(TilesInfo)/sizeof(int));
+
+ device_ptr buffers[9];
+ for(int i = 0; i < 9; i++) {
+ buffers[i] = rtiles[i].buffer;
+ tiles->offsets[i] = rtiles[i].offset;
+ tiles->strides[i] = rtiles[i].stride;
+ }
+ tiles->x[0] = rtiles[3].x;
+ tiles->x[1] = rtiles[4].x;
+ tiles->x[2] = rtiles[5].x;
+ tiles->x[3] = rtiles[5].x + rtiles[5].w;
+ tiles->y[0] = rtiles[1].y;
+ tiles->y[1] = rtiles[4].y;
+ tiles->y[2] = rtiles[7].y;
+ tiles->y[3] = rtiles[7].y + rtiles[7].h;
+
+ render_buffer.offset = rtiles[4].offset;
+ render_buffer.stride = rtiles[4].stride;
+ render_buffer.ptr = rtiles[4].buffer;
+
+ functions.set_tiles(buffers);
+}
+
+bool DenoisingTask::run_denoising()
+{
+ /* Allocate denoising buffer. */
+ buffer.passes = 14;
+ buffer.w = align_up(rect.z - rect.x, 4);
+ buffer.h = rect.w - rect.y;
+ buffer.pass_stride = align_up(buffer.w * buffer.h, divide_up(device->mem_address_alignment(), sizeof(float)));
+ buffer.mem.resize(buffer.pass_stride * buffer.passes);
+ device->mem_alloc("Denoising Pixel Buffer", buffer.mem, MEM_READ_WRITE);
+
+ device_ptr null_ptr = (device_ptr) 0;
+
+ /* Prefilter shadow feature. */
+ {
+ device_sub_ptr unfiltered_a (device, buffer.mem, 0, buffer.pass_stride, MEM_READ_WRITE);
+ device_sub_ptr unfiltered_b (device, buffer.mem, 1*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE);
+ device_sub_ptr sample_var (device, buffer.mem, 2*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE);
+ device_sub_ptr sample_var_var (device, buffer.mem, 3*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE);
+ device_sub_ptr buffer_var (device, buffer.mem, 5*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE);
+ device_sub_ptr filtered_var (device, buffer.mem, 6*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE);
+ device_sub_ptr nlm_temporary_1(device, buffer.mem, 7*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE);
+ device_sub_ptr nlm_temporary_2(device, buffer.mem, 8*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE);
+ device_sub_ptr nlm_temporary_3(device, buffer.mem, 9*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE);
+
+ nlm_state.temporary_1_ptr = *nlm_temporary_1;
+ nlm_state.temporary_2_ptr = *nlm_temporary_2;
+ nlm_state.temporary_3_ptr = *nlm_temporary_3;
+
+ /* Get the A/B unfiltered passes, the combined sample variance, the estimated variance of the sample variance and the buffer variance. */
+ functions.divide_shadow(*unfiltered_a, *unfiltered_b, *sample_var, *sample_var_var, *buffer_var);
+
+ /* Smooth the (generally pretty noisy) buffer variance using the spatial information from the sample variance. */
+ nlm_state.set_parameters(6, 3, 4.0f, 1.0f);
+ functions.non_local_means(*buffer_var, *sample_var, *sample_var_var, *filtered_var);
+
+ /* Reuse memory, the previous data isn't needed anymore. */
+ device_ptr filtered_a = *buffer_var,
+ filtered_b = *sample_var;
+ /* Use the smoothed variance to filter the two shadow half images using each other for weight calculation. */
+ nlm_state.set_parameters(5, 3, 1.0f, 0.25f);
+ functions.non_local_means(*unfiltered_a, *unfiltered_b, *filtered_var, filtered_a);
+ functions.non_local_means(*unfiltered_b, *unfiltered_a, *filtered_var, filtered_b);
+
+ device_ptr residual_var = *sample_var_var;
+ /* Estimate the residual variance between the two filtered halves. */
+ functions.combine_halves(filtered_a, filtered_b, null_ptr, residual_var, 2, rect);
+
+ device_ptr final_a = *unfiltered_a,
+ final_b = *unfiltered_b;
+ /* Use the residual variance for a second filter pass. */
+ nlm_state.set_parameters(4, 2, 1.0f, 0.5f);
+ functions.non_local_means(filtered_a, filtered_b, residual_var, final_a);
+ functions.non_local_means(filtered_b, filtered_a, residual_var, final_b);
+
+ /* Combine the two double-filtered halves to a final shadow feature. */
+ device_sub_ptr shadow_pass(device, buffer.mem, 4*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE);
+ functions.combine_halves(final_a, final_b, *shadow_pass, null_ptr, 0, rect);
+ }
+
+ /* Prefilter general features. */
+ {
+ device_sub_ptr unfiltered (device, buffer.mem, 8*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE);
+ device_sub_ptr variance (device, buffer.mem, 9*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE);
+ device_sub_ptr nlm_temporary_1(device, buffer.mem, 10*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE);
+ device_sub_ptr nlm_temporary_2(device, buffer.mem, 11*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE);
+ device_sub_ptr nlm_temporary_3(device, buffer.mem, 12*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE);
+
+ nlm_state.temporary_1_ptr = *nlm_temporary_1;
+ nlm_state.temporary_2_ptr = *nlm_temporary_2;
+ nlm_state.temporary_3_ptr = *nlm_temporary_3;
+
+ int mean_from[] = { 0, 1, 2, 12, 6, 7, 8 };
+ int variance_from[] = { 3, 4, 5, 13, 9, 10, 11};
+ int pass_to[] = { 1, 2, 3, 0, 5, 6, 7};
+ for(int pass = 0; pass < 7; pass++) {
+ device_sub_ptr feature_pass(device, buffer.mem, pass_to[pass]*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE);
+ /* Get the unfiltered pass and its variance from the RenderBuffers. */
+ functions.get_feature(mean_from[pass], variance_from[pass], *unfiltered, *variance);
+ /* Smooth the pass and store the result in the denoising buffers. */
+ nlm_state.set_parameters(2, 2, 1.0f, 0.25f);
+ functions.non_local_means(*unfiltered, *unfiltered, *variance, *feature_pass);
+ }
+ }
+
+ /* Copy color passes. */
+ {
+ int mean_from[] = {20, 21, 22};
+ int variance_from[] = {23, 24, 25};
+ int mean_to[] = { 8, 9, 10};
+ int variance_to[] = {11, 12, 13};
+ int num_color_passes = 3;
+
+ device_only_memory<float> temp_color;
+ temp_color.resize(3*buffer.pass_stride);
+ device->mem_alloc("Denoising temporary color", temp_color, MEM_READ_WRITE);
+
+ for(int pass = 0; pass < num_color_passes; pass++) {
+ device_sub_ptr color_pass(device, temp_color, pass*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE);
+ device_sub_ptr color_var_pass(device, buffer.mem, variance_to[pass]*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE);
+ functions.get_feature(mean_from[pass], variance_from[pass], *color_pass, *color_var_pass);
+ }
+
+ {
+ device_sub_ptr depth_pass (device, buffer.mem, 0, buffer.pass_stride, MEM_READ_WRITE);
+ device_sub_ptr color_var_pass(device, buffer.mem, variance_to[0]*buffer.pass_stride, 3*buffer.pass_stride, MEM_READ_WRITE);
+ device_sub_ptr output_pass (device, buffer.mem, mean_to[0]*buffer.pass_stride, 3*buffer.pass_stride, MEM_READ_WRITE);
+ functions.detect_outliers(temp_color.device_pointer, *color_var_pass, *depth_pass, *output_pass);
+ }
+
+ device->mem_free(temp_color);
+ }
+
+ storage.w = filter_area.z;
+ storage.h = filter_area.w;
+ storage.transform.resize(storage.w*storage.h*TRANSFORM_SIZE);
+ storage.rank.resize(storage.w*storage.h);
+ device->mem_alloc("Denoising Transform", storage.transform, MEM_READ_WRITE);
+ device->mem_alloc("Denoising Rank", storage.rank, MEM_READ_WRITE);
+
+ functions.construct_transform();
+
+ device_only_memory<float> temporary_1;
+ device_only_memory<float> temporary_2;
+ temporary_1.resize(buffer.w*buffer.h);
+ temporary_2.resize(buffer.w*buffer.h);
+ device->mem_alloc("Denoising NLM temporary 1", temporary_1, MEM_READ_WRITE);
+ device->mem_alloc("Denoising NLM temporary 2", temporary_2, MEM_READ_WRITE);
+ reconstruction_state.temporary_1_ptr = temporary_1.device_pointer;
+ reconstruction_state.temporary_2_ptr = temporary_2.device_pointer;
+
+ storage.XtWX.resize(storage.w*storage.h*XTWX_SIZE);
+ storage.XtWY.resize(storage.w*storage.h*XTWY_SIZE);
+ device->mem_alloc("Denoising XtWX", storage.XtWX, MEM_READ_WRITE);
+ device->mem_alloc("Denoising XtWY", storage.XtWY, MEM_READ_WRITE);
+
+ reconstruction_state.filter_rect = make_int4(filter_area.x-rect.x, filter_area.y-rect.y, storage.w, storage.h);
+ int tile_coordinate_offset = filter_area.y*render_buffer.stride + filter_area.x;
+ reconstruction_state.buffer_params = make_int4(render_buffer.offset + tile_coordinate_offset,
+ render_buffer.stride,
+ render_buffer.pass_stride,
+ render_buffer.denoising_clean_offset);
+ reconstruction_state.source_w = rect.z-rect.x;
+ reconstruction_state.source_h = rect.w-rect.y;
+
+ {
+ device_sub_ptr color_ptr (device, buffer.mem, 8*buffer.pass_stride, 3*buffer.pass_stride, MEM_READ_WRITE);
+ device_sub_ptr color_var_ptr(device, buffer.mem, 11*buffer.pass_stride, 3*buffer.pass_stride, MEM_READ_WRITE);
+ functions.reconstruct(*color_ptr, *color_var_ptr, render_buffer.ptr);
+ }
+
+ device->mem_free(storage.XtWX);
+ device->mem_free(storage.XtWY);
+ device->mem_free(storage.transform);
+ device->mem_free(storage.rank);
+ device->mem_free(temporary_1);
+ device->mem_free(temporary_2);
+ device->mem_free(buffer.mem);
+ device->mem_free(tiles_mem);
+ return true;
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/device/device_denoising.h b/intern/cycles/device/device_denoising.h
new file mode 100644
index 00000000000..def7b72f67d
--- /dev/null
+++ b/intern/cycles/device/device_denoising.h
@@ -0,0 +1,148 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __DEVICE_DENOISING_H__
+#define __DEVICE_DENOISING_H__
+
+#include "device/device.h"
+
+#include "render/buffers.h"
+
+#include "kernel/filter/filter_defines.h"
+
+CCL_NAMESPACE_BEGIN
+
+class DenoisingTask {
+public:
+ /* Parameters of the denoising algorithm. */
+ int radius;
+ float nlm_k_2;
+ float pca_threshold;
+
+ /* Pointer and parameters of the RenderBuffers. */
+ struct RenderBuffers {
+ int denoising_data_offset;
+ int denoising_clean_offset;
+ int pass_stride;
+ int offset;
+ int stride;
+ device_ptr ptr;
+ int samples;
+ } render_buffer;
+
+ TilesInfo *tiles;
+ device_vector<int> tiles_mem;
+ void tiles_from_rendertiles(RenderTile *rtiles);
+
+ int4 rect;
+ int4 filter_area;
+
+ struct DeviceFunctions {
+ function<bool(device_ptr image_ptr, /* Contains the values that are smoothed. */
+ device_ptr guide_ptr, /* Contains the values that are used to calculate weights. */
+ device_ptr variance_ptr, /* Contains the variance of the guide image. */
+ device_ptr out_ptr /* The filtered output is written into this image. */
+ )> non_local_means;
+ function<bool(device_ptr color_ptr,
+ device_ptr color_variance_ptr,
+ device_ptr output_ptr
+ )> reconstruct;
+ function<bool()> construct_transform;
+
+ function<bool(device_ptr a_ptr,
+ device_ptr b_ptr,
+ device_ptr mean_ptr,
+ device_ptr variance_ptr,
+ int r,
+ int4 rect
+ )> combine_halves;
+ function<bool(device_ptr a_ptr,
+ device_ptr b_ptr,
+ device_ptr sample_variance_ptr,
+ device_ptr sv_variance_ptr,
+ device_ptr buffer_variance_ptr
+ )> divide_shadow;
+ function<bool(int mean_offset,
+ int variance_offset,
+ device_ptr mean_ptr,
+ device_ptr variance_ptr
+ )> get_feature;
+ function<bool(device_ptr image_ptr,
+ device_ptr variance_ptr,
+ device_ptr depth_ptr,
+ device_ptr output_ptr
+ )> detect_outliers;
+ function<bool(device_ptr*)> set_tiles;
+ } functions;
+
+ /* Stores state of the current Reconstruction operation,
+ * which is accessed by the device in order to perform the operation. */
+ struct ReconstructionState {
+ device_ptr temporary_1_ptr; /* There two images are used as temporary storage. */
+ device_ptr temporary_2_ptr;
+
+ int4 filter_rect;
+ int4 buffer_params;
+
+ int source_w;
+ int source_h;
+ } reconstruction_state;
+
+ /* Stores state of the current NLM operation,
+ * which is accessed by the device in order to perform the operation. */
+ struct NLMState {
+ device_ptr temporary_1_ptr; /* There three images are used as temporary storage. */
+ device_ptr temporary_2_ptr;
+ device_ptr temporary_3_ptr;
+
+ int r; /* Search radius of the filter. */
+ int f; /* Patch size of the filter. */
+ float a; /* Variance compensation factor in the MSE estimation. */
+ float k_2; /* Squared value of the k parameter of the filter. */
+
+ void set_parameters(int r_, int f_, float a_, float k_2_) { r = r_; f = f_; a = a_, k_2 = k_2_; }
+ } nlm_state;
+
+ struct Storage {
+ device_only_memory<float> transform;
+ device_only_memory<int> rank;
+ device_only_memory<float> XtWX;
+ device_only_memory<float3> XtWY;
+ int w;
+ int h;
+ } storage;
+
+ DenoisingTask(Device *device) : device(device) {}
+
+ void init_from_devicetask(const DeviceTask &task);
+
+ bool run_denoising();
+
+ struct DenoiseBuffers {
+ int pass_stride;
+ int passes;
+ int w;
+ int h;
+ device_only_memory<float> mem;
+ } buffer;
+
+protected:
+ Device *device;
+};
+
+CCL_NAMESPACE_END
+
+#endif /* __DEVICE_DENOISING_H__ */
diff --git a/intern/cycles/device/device_memory.h b/intern/cycles/device/device_memory.h
index 4b10514a9d2..b63dd00068b 100644
--- a/intern/cycles/device/device_memory.h
+++ b/intern/cycles/device/device_memory.h
@@ -35,6 +35,8 @@
CCL_NAMESPACE_BEGIN
+class Device;
+
enum MemoryType {
MEM_READ_ONLY,
MEM_WRITE_ONLY,
@@ -144,7 +146,7 @@ template<> struct device_type_traits<float2> {
template<> struct device_type_traits<float3> {
static const DataType data_type = TYPE_FLOAT;
- static const int num_elements = 3;
+ static const int num_elements = 4;
};
template<> struct device_type_traits<float4> {
@@ -173,6 +175,9 @@ class device_memory
{
public:
size_t memory_size() { return data_size*data_elements*datatype_size(data_type); }
+ size_t memory_elements_size(int elements) {
+ return elements*data_elements*datatype_size(data_type);
+ }
/* data information */
DataType data_type;
@@ -213,6 +218,22 @@ protected:
device_memory& operator = (const device_memory&);
};
+template<typename T>
+class device_only_memory : public device_memory
+{
+public:
+ device_only_memory()
+ {
+ data_type = device_type_traits<T>::data_type;
+ data_elements = max(device_type_traits<T>::num_elements, 1);
+ }
+
+ void resize(size_t num)
+ {
+ device_memory::resize(num*sizeof(T));
+ }
+};
+
/* Device Vector */
template<typename T> class device_vector : public device_memory
@@ -299,6 +320,27 @@ private:
array<T> data;
};
+/* A device_sub_ptr is a pointer into another existing memory.
+ * Therefore, it is not allocated separately, but just created from the already allocated base memory.
+ * It is freed automatically when it goes out of scope, which should happen before the base memory is freed.
+ * Note that some devices require the offset and size of the sub_ptr to be properly aligned. */
+class device_sub_ptr
+{
+public:
+ device_sub_ptr(Device *device, device_memory& mem, int offset, int size, MemoryType type);
+ ~device_sub_ptr();
+ /* No copying. */
+ device_sub_ptr& operator = (const device_sub_ptr&);
+
+ device_ptr operator*() const
+ {
+ return ptr;
+ }
+protected:
+ Device *device;
+ device_ptr ptr;
+};
+
CCL_NAMESPACE_END
#endif /* __DEVICE_MEMORY_H__ */
diff --git a/intern/cycles/device/device_multi.cpp b/intern/cycles/device/device_multi.cpp
index 624260a81c8..bc505b676fc 100644
--- a/intern/cycles/device/device_multi.cpp
+++ b/intern/cycles/device/device_multi.cpp
@@ -299,6 +299,60 @@ public:
return -1;
}
+ void map_neighbor_tiles(Device *sub_device, RenderTile *tiles)
+ {
+ for(int i = 0; i < 9; i++) {
+ if(!tiles[i].buffers) {
+ continue;
+ }
+ /* If the tile was rendered on another device, copy its memory to
+ * to the current device now, for the duration of the denoising task.
+ * Note that this temporarily modifies the RenderBuffers and calls
+ * the device, so this function is not thread safe. */
+ if(tiles[i].buffers->device != sub_device) {
+ device_vector<float> &mem = tiles[i].buffers->buffer;
+
+ tiles[i].buffers->copy_from_device();
+ device_ptr original_ptr = mem.device_pointer;
+ mem.device_pointer = 0;
+ sub_device->mem_alloc("Temporary memory for neighboring tile", mem, MEM_READ_WRITE);
+ sub_device->mem_copy_to(mem);
+ tiles[i].buffer = mem.device_pointer;
+ mem.device_pointer = original_ptr;
+ }
+ }
+ }
+
+ void unmap_neighbor_tiles(Device * sub_device, RenderTile * tiles)
+ {
+ for(int i = 0; i < 9; i++) {
+ if(!tiles[i].buffers) {
+ continue;
+ }
+ if(tiles[i].buffers->device != sub_device) {
+ device_vector<float> &mem = tiles[i].buffers->buffer;
+
+ device_ptr original_ptr = mem.device_pointer;
+ mem.device_pointer = tiles[i].buffer;
+
+ /* Copy denoised tile to the host. */
+ if(i == 4) {
+ tiles[i].buffers->copy_from_device(sub_device);
+ }
+
+ size_t mem_size = mem.device_size;
+ sub_device->mem_free(mem);
+ mem.device_pointer = original_ptr;
+ mem.device_size = mem_size;
+
+ /* Copy denoised tile to the original device. */
+ if(i == 4) {
+ tiles[i].buffers->device->mem_copy_to(mem);
+ }
+ }
+ }
+ }
+
int get_split_task_count(DeviceTask& task)
{
int total_tasks = 0;
diff --git a/intern/cycles/device/device_opencl.cpp b/intern/cycles/device/device_opencl.cpp
index edd2047debc..681b8214b03 100644
--- a/intern/cycles/device/device_opencl.cpp
+++ b/intern/cycles/device/device_opencl.cpp
@@ -130,10 +130,22 @@ string device_opencl_capabilities(void)
opencl_assert(func(id, what, sizeof(data), &data, NULL)); \
result += string_printf("%s: %s\n", name, data); \
} while(false)
+#define APPEND_STRING_EXTENSION_INFO(func, id, name, what) \
+ do { \
+ char data[1024] = "\0"; \
+ size_t length = 0; \
+ if(func(id, what, sizeof(data), &data, &length) == CL_SUCCESS) { \
+ if(length != 0 && data[0] != '\0') { \
+ result += string_printf("%s: %s\n", name, data); \
+ } \
+ } \
+ } while(false)
#define APPEND_PLATFORM_STRING_INFO(id, name, what) \
APPEND_STRING_INFO(clGetPlatformInfo, id, "\tPlatform " name, what)
#define APPEND_DEVICE_STRING_INFO(id, name, what) \
APPEND_STRING_INFO(clGetDeviceInfo, id, "\t\t\tDevice " name, what)
+#define APPEND_DEVICE_STRING_EXTENSION_INFO(id, name, what) \
+ APPEND_STRING_EXTENSION_INFO(clGetDeviceInfo, id, "\t\t\tDevice " name, what)
vector<cl_device_id> device_ids;
for(cl_uint platform = 0; platform < num_platforms; ++platform) {
@@ -167,6 +179,7 @@ string device_opencl_capabilities(void)
result += string_printf("\t\tDevice: #%u\n", device);
APPEND_DEVICE_STRING_INFO(device_id, "Name", CL_DEVICE_NAME);
+ APPEND_DEVICE_STRING_EXTENSION_INFO(device_id, "Board Name", CL_DEVICE_BOARD_NAME_AMD);
APPEND_DEVICE_STRING_INFO(device_id, "Vendor", CL_DEVICE_VENDOR);
APPEND_DEVICE_STRING_INFO(device_id, "OpenCL C Version", CL_DEVICE_OPENCL_C_VERSION);
APPEND_DEVICE_STRING_INFO(device_id, "Profile", CL_DEVICE_PROFILE);
diff --git a/intern/cycles/device/device_split_kernel.cpp b/intern/cycles/device/device_split_kernel.cpp
index 981ec74fe56..d2b3a89fa98 100644
--- a/intern/cycles/device/device_split_kernel.cpp
+++ b/intern/cycles/device/device_split_kernel.cpp
@@ -19,6 +19,7 @@
#include "kernel/kernel_types.h"
#include "kernel/split/kernel_split_data_types.h"
+#include "util/util_logging.h"
#include "util/util_time.h"
CCL_NAMESPACE_BEGIN
@@ -38,12 +39,15 @@ DeviceSplitKernel::DeviceSplitKernel(Device *device) : device(device)
kernel_do_volume = NULL;
kernel_queue_enqueue = NULL;
kernel_indirect_background = NULL;
+ kernel_shader_setup = NULL;
+ kernel_shader_sort = NULL;
kernel_shader_eval = NULL;
kernel_holdout_emission_blurring_pathtermination_ao = NULL;
kernel_subsurface_scatter = NULL;
kernel_direct_lighting = NULL;
kernel_shadow_blocked_ao = NULL;
kernel_shadow_blocked_dl = NULL;
+ kernel_enqueue_inactive = NULL;
kernel_next_iteration_setup = NULL;
kernel_indirect_subsurface = NULL;
kernel_buffer_update = NULL;
@@ -63,12 +67,15 @@ DeviceSplitKernel::~DeviceSplitKernel()
delete kernel_do_volume;
delete kernel_queue_enqueue;
delete kernel_indirect_background;
+ delete kernel_shader_setup;
+ delete kernel_shader_sort;
delete kernel_shader_eval;
delete kernel_holdout_emission_blurring_pathtermination_ao;
delete kernel_subsurface_scatter;
delete kernel_direct_lighting;
delete kernel_shadow_blocked_ao;
delete kernel_shadow_blocked_dl;
+ delete kernel_enqueue_inactive;
delete kernel_next_iteration_setup;
delete kernel_indirect_subsurface;
delete kernel_buffer_update;
@@ -88,12 +95,15 @@ bool DeviceSplitKernel::load_kernels(const DeviceRequestedFeatures& requested_fe
LOAD_KERNEL(do_volume);
LOAD_KERNEL(queue_enqueue);
LOAD_KERNEL(indirect_background);
+ LOAD_KERNEL(shader_setup);
+ LOAD_KERNEL(shader_sort);
LOAD_KERNEL(shader_eval);
LOAD_KERNEL(holdout_emission_blurring_pathtermination_ao);
LOAD_KERNEL(subsurface_scatter);
LOAD_KERNEL(direct_lighting);
LOAD_KERNEL(shadow_blocked_ao);
LOAD_KERNEL(shadow_blocked_dl);
+ LOAD_KERNEL(enqueue_inactive);
LOAD_KERNEL(next_iteration_setup);
LOAD_KERNEL(indirect_subsurface);
LOAD_KERNEL(buffer_update);
@@ -108,6 +118,9 @@ bool DeviceSplitKernel::load_kernels(const DeviceRequestedFeatures& requested_fe
size_t DeviceSplitKernel::max_elements_for_max_buffer_size(device_memory& kg, device_memory& data, uint64_t max_buffer_size)
{
uint64_t size_per_element = state_buffer_size(kg, data, 1024) / 1024;
+ VLOG(1) << "Split state element size: "
+ << string_human_readable_number(size_per_element) << " bytes. ("
+ << string_human_readable_size(size_per_element) << ").";
return max_buffer_size / size_per_element;
}
@@ -156,13 +169,13 @@ bool DeviceSplitKernel::path_trace(DeviceTask *task,
unsigned int max_work_groups = num_global_elements / work_pool_size + 1;
/* Allocate work_pool_wgs memory. */
- work_pool_wgs.resize(max_work_groups * sizeof(unsigned int));
+ work_pool_wgs.resize(max_work_groups);
device->mem_alloc("work_pool_wgs", work_pool_wgs, MEM_READ_WRITE);
- queue_index.resize(NUM_QUEUES * sizeof(int));
+ queue_index.resize(NUM_QUEUES);
device->mem_alloc("queue_index", queue_index, MEM_READ_WRITE);
- use_queues_flag.resize(sizeof(char));
+ use_queues_flag.resize(1);
device->mem_alloc("use_queues_flag", use_queues_flag, MEM_READ_WRITE);
ray_state.resize(num_global_elements);
@@ -227,6 +240,7 @@ bool DeviceSplitKernel::path_trace(DeviceTask *task,
ENQUEUE_SPLIT_KERNEL(path_init, global_size, local_size);
bool activeRaysAvailable = true;
+ double cancel_time = DBL_MAX;
while(activeRaysAvailable) {
/* Do path-iteration in host [Enqueue Path-iteration kernels. */
@@ -236,18 +250,29 @@ bool DeviceSplitKernel::path_trace(DeviceTask *task,
ENQUEUE_SPLIT_KERNEL(do_volume, global_size, local_size);
ENQUEUE_SPLIT_KERNEL(queue_enqueue, global_size, local_size);
ENQUEUE_SPLIT_KERNEL(indirect_background, global_size, local_size);
+ ENQUEUE_SPLIT_KERNEL(shader_setup, global_size, local_size);
+ ENQUEUE_SPLIT_KERNEL(shader_sort, global_size, local_size);
ENQUEUE_SPLIT_KERNEL(shader_eval, global_size, local_size);
ENQUEUE_SPLIT_KERNEL(holdout_emission_blurring_pathtermination_ao, global_size, local_size);
ENQUEUE_SPLIT_KERNEL(subsurface_scatter, global_size, local_size);
+ ENQUEUE_SPLIT_KERNEL(queue_enqueue, global_size, local_size);
ENQUEUE_SPLIT_KERNEL(direct_lighting, global_size, local_size);
ENQUEUE_SPLIT_KERNEL(shadow_blocked_ao, global_size, local_size);
ENQUEUE_SPLIT_KERNEL(shadow_blocked_dl, global_size, local_size);
+ ENQUEUE_SPLIT_KERNEL(enqueue_inactive, global_size, local_size);
ENQUEUE_SPLIT_KERNEL(next_iteration_setup, global_size, local_size);
ENQUEUE_SPLIT_KERNEL(indirect_subsurface, global_size, local_size);
ENQUEUE_SPLIT_KERNEL(queue_enqueue, global_size, local_size);
ENQUEUE_SPLIT_KERNEL(buffer_update, global_size, local_size);
- if(task->get_cancel()) {
+ if(task->get_cancel() && cancel_time == DBL_MAX) {
+ /* Wait up to twice as many seconds for current samples to finish
+ * to avoid artifacts in render result from ending too soon.
+ */
+ cancel_time = time_dt() + 2.0 * time_multiplier;
+ }
+
+ if(time_dt() > cancel_time) {
return true;
}
}
@@ -271,7 +296,7 @@ bool DeviceSplitKernel::path_trace(DeviceTask *task,
}
}
- if(task->get_cancel()) {
+ if(time_dt() > cancel_time) {
return true;
}
}
diff --git a/intern/cycles/device/device_split_kernel.h b/intern/cycles/device/device_split_kernel.h
index 55548122c0c..9c42cb58520 100644
--- a/intern/cycles/device/device_split_kernel.h
+++ b/intern/cycles/device/device_split_kernel.h
@@ -61,12 +61,15 @@ private:
SplitKernelFunction *kernel_do_volume;
SplitKernelFunction *kernel_queue_enqueue;
SplitKernelFunction *kernel_indirect_background;
+ SplitKernelFunction *kernel_shader_setup;
+ SplitKernelFunction *kernel_shader_sort;
SplitKernelFunction *kernel_shader_eval;
SplitKernelFunction *kernel_holdout_emission_blurring_pathtermination_ao;
SplitKernelFunction *kernel_subsurface_scatter;
SplitKernelFunction *kernel_direct_lighting;
SplitKernelFunction *kernel_shadow_blocked_ao;
SplitKernelFunction *kernel_shadow_blocked_dl;
+ SplitKernelFunction *kernel_enqueue_inactive;
SplitKernelFunction *kernel_next_iteration_setup;
SplitKernelFunction *kernel_indirect_subsurface;
SplitKernelFunction *kernel_buffer_update;
@@ -78,16 +81,16 @@ private:
*/
device_memory split_data;
device_vector<uchar> ray_state;
- device_memory queue_index; /* Array of size num_queues * sizeof(int) that tracks the size of each queue. */
+ device_only_memory<int> queue_index; /* Array of size num_queues that tracks the size of each queue. */
/* Flag to make sceneintersect and lampemission kernel use queues. */
- device_memory use_queues_flag;
+ device_only_memory<char> use_queues_flag;
/* Approximate time it takes to complete one sample */
double avg_time_per_sample;
/* Work pool with respect to each work group. */
- device_memory work_pool_wgs;
+ device_only_memory<unsigned int> work_pool_wgs;
/* clos_max value for which the kernels have been loaded currently. */
int current_max_closure;
@@ -122,7 +125,8 @@ public:
device_memory& use_queues_flag,
device_memory& work_pool_wgs) = 0;
- virtual SplitKernelFunction* get_split_kernel_function(string kernel_name, const DeviceRequestedFeatures&) = 0;
+ virtual SplitKernelFunction* get_split_kernel_function(const string& kernel_name,
+ const DeviceRequestedFeatures&) = 0;
virtual int2 split_kernel_local_size() = 0;
virtual int2 split_kernel_global_size(device_memory& kg, device_memory& data, DeviceTask *task) = 0;
};
diff --git a/intern/cycles/device/device_task.cpp b/intern/cycles/device/device_task.cpp
index ca303365627..3bc4c310283 100644
--- a/intern/cycles/device/device_task.cpp
+++ b/intern/cycles/device/device_task.cpp
@@ -56,7 +56,7 @@ int DeviceTask::get_subtask_count(int num, int max_size)
if(type == SHADER) {
num = min(shader_w, num);
}
- else if(type == PATH_TRACE) {
+ else if(type == RENDER) {
}
else {
num = min(h, num);
@@ -82,7 +82,7 @@ void DeviceTask::split(list<DeviceTask>& tasks, int num, int max_size)
tasks.push_back(task);
}
}
- else if(type == PATH_TRACE) {
+ else if(type == RENDER) {
for(int i = 0; i < num; i++)
tasks.push_back(*this);
}
@@ -103,7 +103,7 @@ void DeviceTask::split(list<DeviceTask>& tasks, int num, int max_size)
void DeviceTask::update_progress(RenderTile *rtile, int pixel_samples)
{
- if((type != PATH_TRACE) &&
+ if((type != RENDER) &&
(type != SHADER))
return;
diff --git a/intern/cycles/device/device_task.h b/intern/cycles/device/device_task.h
index feee89fd6e4..44a1efff1f5 100644
--- a/intern/cycles/device/device_task.h
+++ b/intern/cycles/device/device_task.h
@@ -34,7 +34,7 @@ class Tile;
class DeviceTask : public Task {
public:
- typedef enum { PATH_TRACE, FILM_CONVERT, SHADER } Type;
+ typedef enum { RENDER, FILM_CONVERT, SHADER } Type;
Type type;
int x, y, w, h;
@@ -53,7 +53,7 @@ public:
int passes_size;
- explicit DeviceTask(Type type = PATH_TRACE);
+ explicit DeviceTask(Type type = RENDER);
int get_subtask_count(int num, int max_size = 0);
void split(list<DeviceTask>& tasks, int num, int max_size = 0);
@@ -65,6 +65,16 @@ public:
function<void(RenderTile&)> update_tile_sample;
function<void(RenderTile&)> release_tile;
function<bool(void)> get_cancel;
+ function<void(RenderTile*, Device*)> map_neighbor_tiles;
+ function<void(RenderTile*, Device*)> unmap_neighbor_tiles;
+
+ int denoising_radius;
+ float denoising_strength;
+ float denoising_feature_strength;
+ bool denoising_relative_pca;
+ int pass_stride;
+ int pass_denoising_data;
+ int pass_denoising_clean;
bool need_finish_queue;
bool integrator_branched;
diff --git a/intern/cycles/device/opencl/opencl.h b/intern/cycles/device/opencl/opencl.h
index 764216d0dfa..78ca377d933 100644
--- a/intern/cycles/device/opencl/opencl.h
+++ b/intern/cycles/device/opencl/opencl.h
@@ -17,6 +17,7 @@
#ifdef WITH_OPENCL
#include "device/device.h"
+#include "device/device_denoising.h"
#include "util/util_map.h"
#include "util/util_param.h"
@@ -26,24 +27,24 @@
CCL_NAMESPACE_BEGIN
+/* Disable workarounds, seems to be working fine on latest drivers. */
+#define CYCLES_DISABLE_DRIVER_WORKAROUNDS
+
/* Define CYCLES_DISABLE_DRIVER_WORKAROUNDS to disable workaounds for testing */
#ifndef CYCLES_DISABLE_DRIVER_WORKAROUNDS
/* Work around AMD driver hangs by ensuring each command is finished before doing anything else. */
# undef clEnqueueNDRangeKernel
# define clEnqueueNDRangeKernel(a, b, c, d, e, f, g, h, i) \
- clFinish(a); \
CLEW_GET_FUN(__clewEnqueueNDRangeKernel)(a, b, c, d, e, f, g, h, i); \
clFinish(a);
# undef clEnqueueWriteBuffer
# define clEnqueueWriteBuffer(a, b, c, d, e, f, g, h, i) \
- clFinish(a); \
CLEW_GET_FUN(__clewEnqueueWriteBuffer)(a, b, c, d, e, f, g, h, i); \
clFinish(a);
# undef clEnqueueReadBuffer
# define clEnqueueReadBuffer(a, b, c, d, e, f, g, h, i) \
- clFinish(a); \
CLEW_GET_FUN(__clewEnqueueReadBuffer)(a, b, c, d, e, f, g, h, i); \
clFinish(a);
#endif /* CYCLES_DISABLE_DRIVER_WORKAROUNDS */
@@ -86,7 +87,7 @@ public:
string *error = NULL);
static bool device_version_check(cl_device_id device,
string *error = NULL);
- static string get_hardware_id(string platform_name,
+ static string get_hardware_id(const string& platform_name,
cl_device_id device_id);
static void get_usable_devices(vector<OpenCLPlatformDevice> *usable_devices,
bool force_all = false);
@@ -132,6 +133,13 @@ public:
cl_int* error = NULL);
static cl_device_type get_device_type(cl_device_id device_id);
+ static bool get_driver_version(cl_device_id device_id,
+ int *major,
+ int *minor,
+ cl_int* error = NULL);
+
+ static int mem_address_alignment(cl_device_id device_id);
+
/* Get somewhat more readable device name.
* Main difference is AMD OpenCL here which only gives code name
* for the regular device name. This will give more sane device
@@ -221,7 +229,7 @@ public:
cl_int err = stmt; \
\
if(err != CL_SUCCESS) { \
- string message = string_printf("OpenCL error: %s in %s", clewErrorString(err), #stmt); \
+ string message = string_printf("OpenCL error: %s in %s (%s:%d)", clewErrorString(err), #stmt, __FILE__, __LINE__); \
if(error_msg == "") \
error_msg = message; \
fprintf(stderr, "%s\n", message.c_str()); \
@@ -242,17 +250,17 @@ public:
public:
OpenCLProgram() : loaded(false), device(NULL) {}
OpenCLProgram(OpenCLDeviceBase *device,
- string program_name,
- string kernel_name,
- string kernel_build_options,
+ const string& program_name,
+ const string& kernel_name,
+ const string& kernel_build_options,
bool use_stdout = true);
~OpenCLProgram();
void add_kernel(ustring name);
void load();
- bool is_loaded() { return loaded; }
- string get_log() { return log; }
+ bool is_loaded() const { return loaded; }
+ const string& get_log() const { return log; }
void report_error();
cl_kernel operator()();
@@ -266,8 +274,8 @@ public:
bool load_binary(const string& clbin, const string *debug_src = NULL);
bool save_binary(const string& clbin);
- void add_log(string msg, bool is_debug);
- void add_error(string msg);
+ void add_log(const string& msg, bool is_debug);
+ void add_error(const string& msg);
bool loaded;
cl_program program;
@@ -285,7 +293,7 @@ public:
map<ustring, cl_kernel> kernels;
};
- OpenCLProgram base_program;
+ OpenCLProgram base_program, denoising_program;
typedef map<string, device_vector<uchar>*> ConstMemMap;
typedef map<string, device_ptr> MemMap;
@@ -323,6 +331,9 @@ public:
void mem_copy_from(device_memory& mem, int y, int w, int h, int elem);
void mem_zero(device_memory& mem);
void mem_free(device_memory& mem);
+
+ int mem_address_alignment();
+
void const_copy_to(const char *name, void *host, size_t size);
void tex_alloc(const char *name,
device_memory& mem,
@@ -331,12 +342,14 @@ public:
void tex_free(device_memory& mem);
size_t global_size_round_up(int group_size, int global_size);
- void enqueue_kernel(cl_kernel kernel, size_t w, size_t h);
+ void enqueue_kernel(cl_kernel kernel, size_t w, size_t h, size_t max_workgroup_size = -1);
void set_kernel_arg_mem(cl_kernel kernel, cl_uint *narg, const char *name);
void film_convert(DeviceTask& task, device_ptr buffer, device_ptr rgba_byte, device_ptr rgba_half);
void shader(DeviceTask& task);
+ void denoise(RenderTile& tile, const DeviceTask& task);
+
class OpenCLDeviceTask : public DeviceTask {
public:
OpenCLDeviceTask(OpenCLDeviceBase *device, DeviceTask& task)
@@ -370,9 +383,51 @@ public:
virtual void thread_run(DeviceTask * /*task*/) = 0;
+ virtual bool is_split_kernel() = 0;
+
protected:
string kernel_build_options(const string *debug_src = NULL);
+ void mem_zero_kernel(device_ptr ptr, size_t size);
+
+ bool denoising_non_local_means(device_ptr image_ptr,
+ device_ptr guide_ptr,
+ device_ptr variance_ptr,
+ device_ptr out_ptr,
+ DenoisingTask *task);
+ bool denoising_construct_transform(DenoisingTask *task);
+ bool denoising_reconstruct(device_ptr color_ptr,
+ device_ptr color_variance_ptr,
+ device_ptr output_ptr,
+ DenoisingTask *task);
+ bool denoising_combine_halves(device_ptr a_ptr,
+ device_ptr b_ptr,
+ device_ptr mean_ptr,
+ device_ptr variance_ptr,
+ int r, int4 rect,
+ DenoisingTask *task);
+ bool denoising_divide_shadow(device_ptr a_ptr,
+ device_ptr b_ptr,
+ device_ptr sample_variance_ptr,
+ device_ptr sv_variance_ptr,
+ device_ptr buffer_variance_ptr,
+ DenoisingTask *task);
+ bool denoising_get_feature(int mean_offset,
+ int variance_offset,
+ device_ptr mean_ptr,
+ device_ptr variance_ptr,
+ DenoisingTask *task);
+ bool denoising_detect_outliers(device_ptr image_ptr,
+ device_ptr variance_ptr,
+ device_ptr depth_ptr,
+ device_ptr output_ptr,
+ DenoisingTask *task);
+ bool denoising_set_tiles(device_ptr *buffers,
+ DenoisingTask *task);
+
+ device_ptr mem_alloc_sub_ptr(device_memory& mem, int offset, int size, MemoryType type);
+ void mem_free_sub_ptr(device_ptr ptr);
+
class ArgumentWrapper {
public:
ArgumentWrapper() : size(0), pointer(NULL)
diff --git a/intern/cycles/device/opencl/opencl_base.cpp b/intern/cycles/device/opencl/opencl_base.cpp
index 52d0662a8e3..509da7a0a84 100644
--- a/intern/cycles/device/opencl/opencl_base.cpp
+++ b/intern/cycles/device/opencl/opencl_base.cpp
@@ -20,6 +20,7 @@
#include "kernel/kernel_types.h"
+#include "util/util_algorithm.h"
#include "util/util_foreach.h"
#include "util/util_logging.h"
#include "util/util_md5.h"
@@ -213,8 +214,24 @@ bool OpenCLDeviceBase::load_kernels(const DeviceRequestedFeatures& requested_fea
base_program.add_kernel(ustring("bake"));
base_program.add_kernel(ustring("zero_buffer"));
+ denoising_program = OpenCLProgram(this, "denoising", "filter.cl", "");
+ denoising_program.add_kernel(ustring("filter_divide_shadow"));
+ denoising_program.add_kernel(ustring("filter_get_feature"));
+ denoising_program.add_kernel(ustring("filter_detect_outliers"));
+ denoising_program.add_kernel(ustring("filter_combine_halves"));
+ denoising_program.add_kernel(ustring("filter_construct_transform"));
+ denoising_program.add_kernel(ustring("filter_nlm_calc_difference"));
+ denoising_program.add_kernel(ustring("filter_nlm_blur"));
+ denoising_program.add_kernel(ustring("filter_nlm_calc_weight"));
+ denoising_program.add_kernel(ustring("filter_nlm_update_output"));
+ denoising_program.add_kernel(ustring("filter_nlm_normalize"));
+ denoising_program.add_kernel(ustring("filter_nlm_construct_gramian"));
+ denoising_program.add_kernel(ustring("filter_finalize"));
+ denoising_program.add_kernel(ustring("filter_set_tiles"));
+
vector<OpenCLProgram*> programs;
programs.push_back(&base_program);
+ programs.push_back(&denoising_program);
/* Call actual class to fill the vector with its programs. */
if(!load_kernels(requested_features, programs)) {
return false;
@@ -260,6 +277,25 @@ void OpenCLDeviceBase::mem_alloc(const char *name, device_memory& mem, MemoryTyp
size_t size = mem.memory_size();
+ /* check there is enough memory available for the allocation */
+ cl_ulong max_alloc_size = 0;
+ clGetDeviceInfo(cdDevice, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), &max_alloc_size, NULL);
+
+ if(DebugFlags().opencl.mem_limit) {
+ max_alloc_size = min(max_alloc_size,
+ cl_ulong(DebugFlags().opencl.mem_limit - stats.mem_used));
+ }
+
+ if(size > max_alloc_size) {
+ string error = "Scene too complex to fit in available memory.";
+ if(name != NULL) {
+ error += string_printf(" (allocating buffer %s failed.)", name);
+ }
+ set_error(error);
+
+ return;
+ }
+
cl_mem_flags mem_flag;
void *mem_ptr = NULL;
@@ -322,37 +358,42 @@ void OpenCLDeviceBase::mem_copy_from(device_memory& mem, int y, int w, int h, in
NULL, NULL));
}
-void OpenCLDeviceBase::mem_zero(device_memory& mem)
+void OpenCLDeviceBase::mem_zero_kernel(device_ptr mem, size_t size)
{
- if(mem.device_pointer) {
- if(base_program.is_loaded()) {
- cl_kernel ckZeroBuffer = base_program(ustring("zero_buffer"));
+ cl_kernel ckZeroBuffer = base_program(ustring("zero_buffer"));
- size_t global_size[] = {1024, 1024};
- size_t num_threads = global_size[0] * global_size[1];
+ size_t global_size[] = {1024, 1024};
+ size_t num_threads = global_size[0] * global_size[1];
- cl_mem d_buffer = CL_MEM_PTR(mem.device_pointer);
- cl_ulong d_offset = 0;
- cl_ulong d_size = 0;
+ cl_mem d_buffer = CL_MEM_PTR(mem);
+ cl_ulong d_offset = 0;
+ cl_ulong d_size = 0;
- while(d_offset < mem.memory_size()) {
- d_size = std::min<cl_ulong>(num_threads*sizeof(float4), mem.memory_size() - d_offset);
+ while(d_offset < size) {
+ d_size = std::min<cl_ulong>(num_threads*sizeof(float4), size - d_offset);
- kernel_set_args(ckZeroBuffer, 0, d_buffer, d_size, d_offset);
+ kernel_set_args(ckZeroBuffer, 0, d_buffer, d_size, d_offset);
- ciErr = clEnqueueNDRangeKernel(cqCommandQueue,
- ckZeroBuffer,
- 2,
- NULL,
- global_size,
- NULL,
- 0,
- NULL,
- NULL);
- opencl_assert_err(ciErr, "clEnqueueNDRangeKernel");
+ ciErr = clEnqueueNDRangeKernel(cqCommandQueue,
+ ckZeroBuffer,
+ 2,
+ NULL,
+ global_size,
+ NULL,
+ 0,
+ NULL,
+ NULL);
+ opencl_assert_err(ciErr, "clEnqueueNDRangeKernel");
- d_offset += d_size;
- }
+ d_offset += d_size;
+ }
+}
+
+void OpenCLDeviceBase::mem_zero(device_memory& mem)
+{
+ if(mem.device_pointer) {
+ if(base_program.is_loaded()) {
+ mem_zero_kernel(mem.device_pointer, mem.memory_size());
}
if(mem.data_pointer) {
@@ -396,6 +437,41 @@ void OpenCLDeviceBase::mem_free(device_memory& mem)
}
}
+int OpenCLDeviceBase::mem_address_alignment()
+{
+ return OpenCLInfo::mem_address_alignment(cdDevice);
+}
+
+device_ptr OpenCLDeviceBase::mem_alloc_sub_ptr(device_memory& mem, int offset, int size, MemoryType type)
+{
+ cl_mem_flags mem_flag;
+ if(type == MEM_READ_ONLY)
+ mem_flag = CL_MEM_READ_ONLY;
+ else if(type == MEM_WRITE_ONLY)
+ mem_flag = CL_MEM_WRITE_ONLY;
+ else
+ mem_flag = CL_MEM_READ_WRITE;
+
+ cl_buffer_region info;
+ info.origin = mem.memory_elements_size(offset);
+ info.size = mem.memory_elements_size(size);
+
+ device_ptr sub_buf = (device_ptr) clCreateSubBuffer(CL_MEM_PTR(mem.device_pointer),
+ mem_flag,
+ CL_BUFFER_CREATE_TYPE_REGION,
+ &info,
+ &ciErr);
+ opencl_assert_err(ciErr, "clCreateSubBuffer");
+ return sub_buf;
+}
+
+void OpenCLDeviceBase::mem_free_sub_ptr(device_ptr device_pointer)
+{
+ if(device_pointer && device_pointer != null_mem) {
+ opencl_assert(clReleaseMemObject(CL_MEM_PTR(device_pointer)));
+ }
+}
+
void OpenCLDeviceBase::const_copy_to(const char *name, void *host, size_t size)
{
ConstMemMap::iterator i = const_mem_map.find(name);
@@ -449,7 +525,7 @@ size_t OpenCLDeviceBase::global_size_round_up(int group_size, int global_size)
return global_size + ((r == 0)? 0: group_size - r);
}
-void OpenCLDeviceBase::enqueue_kernel(cl_kernel kernel, size_t w, size_t h)
+void OpenCLDeviceBase::enqueue_kernel(cl_kernel kernel, size_t w, size_t h, size_t max_workgroup_size)
{
size_t workgroup_size, max_work_items[3];
@@ -458,6 +534,10 @@ void OpenCLDeviceBase::enqueue_kernel(cl_kernel kernel, size_t w, size_t h)
clGetDeviceInfo(cdDevice,
CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(size_t)*3, max_work_items, NULL);
+ if(max_workgroup_size > 0 && workgroup_size > max_workgroup_size) {
+ workgroup_size = max_workgroup_size;
+ }
+
/* Try to divide evenly over 2 dimensions. */
size_t sqrt_workgroup_size = max((size_t)sqrt((double)workgroup_size), 1);
size_t local_size[2] = {sqrt_workgroup_size, sqrt_workgroup_size};
@@ -543,6 +623,380 @@ set_kernel_arg_mem(ckFilmConvertKernel, &start_arg_index, #name);
enqueue_kernel(ckFilmConvertKernel, d_w, d_h);
}
+bool OpenCLDeviceBase::denoising_non_local_means(device_ptr image_ptr,
+ device_ptr guide_ptr,
+ device_ptr variance_ptr,
+ device_ptr out_ptr,
+ DenoisingTask *task)
+{
+ int4 rect = task->rect;
+ int w = rect.z-rect.x;
+ int h = rect.w-rect.y;
+ int r = task->nlm_state.r;
+ int f = task->nlm_state.f;
+ float a = task->nlm_state.a;
+ float k_2 = task->nlm_state.k_2;
+
+ cl_mem difference = CL_MEM_PTR(task->nlm_state.temporary_1_ptr);
+ cl_mem blurDifference = CL_MEM_PTR(task->nlm_state.temporary_2_ptr);
+ cl_mem weightAccum = CL_MEM_PTR(task->nlm_state.temporary_3_ptr);
+
+ cl_mem image_mem = CL_MEM_PTR(image_ptr);
+ cl_mem guide_mem = CL_MEM_PTR(guide_ptr);
+ cl_mem variance_mem = CL_MEM_PTR(variance_ptr);
+ cl_mem out_mem = CL_MEM_PTR(out_ptr);
+
+ mem_zero_kernel(task->nlm_state.temporary_3_ptr, sizeof(float)*w*h);
+ mem_zero_kernel(out_ptr, sizeof(float)*w*h);
+
+ cl_kernel ckNLMCalcDifference = denoising_program(ustring("filter_nlm_calc_difference"));
+ cl_kernel ckNLMBlur = denoising_program(ustring("filter_nlm_blur"));
+ cl_kernel ckNLMCalcWeight = denoising_program(ustring("filter_nlm_calc_weight"));
+ cl_kernel ckNLMUpdateOutput = denoising_program(ustring("filter_nlm_update_output"));
+ cl_kernel ckNLMNormalize = denoising_program(ustring("filter_nlm_normalize"));
+
+ for(int i = 0; i < (2*r+1)*(2*r+1); i++) {
+ int dy = i / (2*r+1) - r;
+ int dx = i % (2*r+1) - r;
+ int4 local_rect = make_int4(max(0, -dx), max(0, -dy), rect.z-rect.x - max(0, dx), rect.w-rect.y - max(0, dy));
+ kernel_set_args(ckNLMCalcDifference, 0,
+ dx, dy, guide_mem, variance_mem,
+ difference, local_rect, w, 0, a, k_2);
+ kernel_set_args(ckNLMBlur, 0,
+ difference, blurDifference, local_rect, w, f);
+ kernel_set_args(ckNLMCalcWeight, 0,
+ blurDifference, difference, local_rect, w, f);
+ kernel_set_args(ckNLMUpdateOutput, 0,
+ dx, dy, blurDifference, image_mem,
+ out_mem, weightAccum, local_rect, w, f);
+
+ enqueue_kernel(ckNLMCalcDifference, w, h);
+ enqueue_kernel(ckNLMBlur, w, h);
+ enqueue_kernel(ckNLMCalcWeight, w, h);
+ enqueue_kernel(ckNLMBlur, w, h);
+ enqueue_kernel(ckNLMUpdateOutput, w, h);
+ }
+
+ int4 local_rect = make_int4(0, 0, w, h);
+ kernel_set_args(ckNLMNormalize, 0,
+ out_mem, weightAccum, local_rect, w);
+ enqueue_kernel(ckNLMNormalize, w, h);
+
+ return true;
+}
+
+bool OpenCLDeviceBase::denoising_construct_transform(DenoisingTask *task)
+{
+ cl_mem buffer_mem = CL_MEM_PTR(task->buffer.mem.device_pointer);
+ cl_mem transform_mem = CL_MEM_PTR(task->storage.transform.device_pointer);
+ cl_mem rank_mem = CL_MEM_PTR(task->storage.rank.device_pointer);
+
+ cl_kernel ckFilterConstructTransform = denoising_program(ustring("filter_construct_transform"));
+
+ kernel_set_args(ckFilterConstructTransform, 0,
+ buffer_mem,
+ transform_mem,
+ rank_mem,
+ task->filter_area,
+ task->rect,
+ task->buffer.pass_stride,
+ task->radius,
+ task->pca_threshold);
+
+ enqueue_kernel(ckFilterConstructTransform,
+ task->storage.w,
+ task->storage.h,
+ 256);
+
+ return true;
+}
+
+bool OpenCLDeviceBase::denoising_reconstruct(device_ptr color_ptr,
+ device_ptr color_variance_ptr,
+ device_ptr output_ptr,
+ DenoisingTask *task)
+{
+ mem_zero(task->storage.XtWX);
+ mem_zero(task->storage.XtWY);
+
+ cl_mem color_mem = CL_MEM_PTR(color_ptr);
+ cl_mem color_variance_mem = CL_MEM_PTR(color_variance_ptr);
+ cl_mem output_mem = CL_MEM_PTR(output_ptr);
+
+ cl_mem buffer_mem = CL_MEM_PTR(task->buffer.mem.device_pointer);
+ cl_mem transform_mem = CL_MEM_PTR(task->storage.transform.device_pointer);
+ cl_mem rank_mem = CL_MEM_PTR(task->storage.rank.device_pointer);
+ cl_mem XtWX_mem = CL_MEM_PTR(task->storage.XtWX.device_pointer);
+ cl_mem XtWY_mem = CL_MEM_PTR(task->storage.XtWY.device_pointer);
+
+ cl_kernel ckNLMCalcDifference = denoising_program(ustring("filter_nlm_calc_difference"));
+ cl_kernel ckNLMBlur = denoising_program(ustring("filter_nlm_blur"));
+ cl_kernel ckNLMCalcWeight = denoising_program(ustring("filter_nlm_calc_weight"));
+ cl_kernel ckNLMConstructGramian = denoising_program(ustring("filter_nlm_construct_gramian"));
+ cl_kernel ckFinalize = denoising_program(ustring("filter_finalize"));
+
+ cl_mem difference = CL_MEM_PTR(task->reconstruction_state.temporary_1_ptr);
+ cl_mem blurDifference = CL_MEM_PTR(task->reconstruction_state.temporary_2_ptr);
+
+ int r = task->radius;
+ int f = 4;
+ float a = 1.0f;
+ for(int i = 0; i < (2*r+1)*(2*r+1); i++) {
+ int dy = i / (2*r+1) - r;
+ int dx = i % (2*r+1) - r;
+
+ int local_rect[4] = {max(0, -dx), max(0, -dy),
+ task->reconstruction_state.source_w - max(0, dx),
+ task->reconstruction_state.source_h - max(0, dy)};
+
+ kernel_set_args(ckNLMCalcDifference, 0,
+ dx, dy,
+ color_mem,
+ color_variance_mem,
+ difference,
+ local_rect,
+ task->buffer.w,
+ task->buffer.pass_stride,
+ a, task->nlm_k_2);
+ enqueue_kernel(ckNLMCalcDifference,
+ task->reconstruction_state.source_w,
+ task->reconstruction_state.source_h);
+
+ kernel_set_args(ckNLMBlur, 0,
+ difference,
+ blurDifference,
+ local_rect,
+ task->buffer.w,
+ f);
+ enqueue_kernel(ckNLMBlur,
+ task->reconstruction_state.source_w,
+ task->reconstruction_state.source_h);
+
+ kernel_set_args(ckNLMCalcWeight, 0,
+ blurDifference,
+ difference,
+ local_rect,
+ task->buffer.w,
+ f);
+ enqueue_kernel(ckNLMCalcWeight,
+ task->reconstruction_state.source_w,
+ task->reconstruction_state.source_h);
+
+ /* Reuse previous arguments. */
+ enqueue_kernel(ckNLMBlur,
+ task->reconstruction_state.source_w,
+ task->reconstruction_state.source_h);
+
+ kernel_set_args(ckNLMConstructGramian, 0,
+ dx, dy,
+ blurDifference,
+ buffer_mem,
+ transform_mem,
+ rank_mem,
+ XtWX_mem,
+ XtWY_mem,
+ local_rect,
+ task->reconstruction_state.filter_rect,
+ task->buffer.w,
+ task->buffer.h,
+ f,
+ task->buffer.pass_stride);
+ enqueue_kernel(ckNLMConstructGramian,
+ task->reconstruction_state.source_w,
+ task->reconstruction_state.source_h,
+ 256);
+ }
+
+ kernel_set_args(ckFinalize, 0,
+ task->buffer.w,
+ task->buffer.h,
+ output_mem,
+ rank_mem,
+ XtWX_mem,
+ XtWY_mem,
+ task->filter_area,
+ task->reconstruction_state.buffer_params,
+ task->render_buffer.samples);
+ enqueue_kernel(ckFinalize,
+ task->reconstruction_state.source_w,
+ task->reconstruction_state.source_h);
+
+ return true;
+}
+
+bool OpenCLDeviceBase::denoising_combine_halves(device_ptr a_ptr,
+ device_ptr b_ptr,
+ device_ptr mean_ptr,
+ device_ptr variance_ptr,
+ int r, int4 rect,
+ DenoisingTask *task)
+{
+ cl_mem a_mem = CL_MEM_PTR(a_ptr);
+ cl_mem b_mem = CL_MEM_PTR(b_ptr);
+ cl_mem mean_mem = CL_MEM_PTR(mean_ptr);
+ cl_mem variance_mem = CL_MEM_PTR(variance_ptr);
+
+ cl_kernel ckFilterCombineHalves = denoising_program(ustring("filter_combine_halves"));
+
+ kernel_set_args(ckFilterCombineHalves, 0,
+ mean_mem,
+ variance_mem,
+ a_mem,
+ b_mem,
+ rect,
+ r);
+ enqueue_kernel(ckFilterCombineHalves,
+ task->rect.z-task->rect.x,
+ task->rect.w-task->rect.y);
+
+ return true;
+}
+
+bool OpenCLDeviceBase::denoising_divide_shadow(device_ptr a_ptr,
+ device_ptr b_ptr,
+ device_ptr sample_variance_ptr,
+ device_ptr sv_variance_ptr,
+ device_ptr buffer_variance_ptr,
+ DenoisingTask *task)
+{
+ cl_mem a_mem = CL_MEM_PTR(a_ptr);
+ cl_mem b_mem = CL_MEM_PTR(b_ptr);
+ cl_mem sample_variance_mem = CL_MEM_PTR(sample_variance_ptr);
+ cl_mem sv_variance_mem = CL_MEM_PTR(sv_variance_ptr);
+ cl_mem buffer_variance_mem = CL_MEM_PTR(buffer_variance_ptr);
+
+ cl_mem tiles_mem = CL_MEM_PTR(task->tiles_mem.device_pointer);
+
+ cl_kernel ckFilterDivideShadow = denoising_program(ustring("filter_divide_shadow"));
+
+ char split_kernel = is_split_kernel()? 1 : 0;
+ kernel_set_args(ckFilterDivideShadow, 0,
+ task->render_buffer.samples,
+ tiles_mem,
+ a_mem,
+ b_mem,
+ sample_variance_mem,
+ sv_variance_mem,
+ buffer_variance_mem,
+ task->rect,
+ task->render_buffer.pass_stride,
+ task->render_buffer.denoising_data_offset,
+ split_kernel);
+ enqueue_kernel(ckFilterDivideShadow,
+ task->rect.z-task->rect.x,
+ task->rect.w-task->rect.y);
+
+ return true;
+}
+
+bool OpenCLDeviceBase::denoising_get_feature(int mean_offset,
+ int variance_offset,
+ device_ptr mean_ptr,
+ device_ptr variance_ptr,
+ DenoisingTask *task)
+{
+ cl_mem mean_mem = CL_MEM_PTR(mean_ptr);
+ cl_mem variance_mem = CL_MEM_PTR(variance_ptr);
+
+ cl_mem tiles_mem = CL_MEM_PTR(task->tiles_mem.device_pointer);
+
+ cl_kernel ckFilterGetFeature = denoising_program(ustring("filter_get_feature"));
+
+ char split_kernel = is_split_kernel()? 1 : 0;
+ kernel_set_args(ckFilterGetFeature, 0,
+ task->render_buffer.samples,
+ tiles_mem,
+ mean_offset,
+ variance_offset,
+ mean_mem,
+ variance_mem,
+ task->rect,
+ task->render_buffer.pass_stride,
+ task->render_buffer.denoising_data_offset,
+ split_kernel);
+ enqueue_kernel(ckFilterGetFeature,
+ task->rect.z-task->rect.x,
+ task->rect.w-task->rect.y);
+
+ return true;
+}
+
+bool OpenCLDeviceBase::denoising_detect_outliers(device_ptr image_ptr,
+ device_ptr variance_ptr,
+ device_ptr depth_ptr,
+ device_ptr output_ptr,
+ DenoisingTask *task)
+{
+ cl_mem image_mem = CL_MEM_PTR(image_ptr);
+ cl_mem variance_mem = CL_MEM_PTR(variance_ptr);
+ cl_mem depth_mem = CL_MEM_PTR(depth_ptr);
+ cl_mem output_mem = CL_MEM_PTR(output_ptr);
+
+ cl_kernel ckFilterDetectOutliers = denoising_program(ustring("filter_detect_outliers"));
+
+ kernel_set_args(ckFilterDetectOutliers, 0,
+ image_mem,
+ variance_mem,
+ depth_mem,
+ output_mem,
+ task->rect,
+ task->buffer.pass_stride);
+ enqueue_kernel(ckFilterDetectOutliers,
+ task->rect.z-task->rect.x,
+ task->rect.w-task->rect.y);
+
+ return true;
+}
+
+bool OpenCLDeviceBase::denoising_set_tiles(device_ptr *buffers,
+ DenoisingTask *task)
+{
+ mem_alloc("Denoising Tile Info", task->tiles_mem, MEM_READ_WRITE);
+ mem_copy_to(task->tiles_mem);
+
+ cl_mem tiles_mem = CL_MEM_PTR(task->tiles_mem.device_pointer);
+
+ cl_kernel ckFilterSetTiles = denoising_program(ustring("filter_set_tiles"));
+
+ kernel_set_args(ckFilterSetTiles, 0, tiles_mem);
+ for(int i = 0; i < 9; i++) {
+ cl_mem buffer_mem = CL_MEM_PTR(buffers[i]);
+ kernel_set_args(ckFilterSetTiles, i+1, buffer_mem);
+ }
+
+ enqueue_kernel(ckFilterSetTiles, 1, 1);
+
+ return true;
+}
+
+void OpenCLDeviceBase::denoise(RenderTile &rtile, const DeviceTask &task)
+{
+ DenoisingTask denoising(this);
+
+ denoising.functions.set_tiles = function_bind(&OpenCLDeviceBase::denoising_set_tiles, this, _1, &denoising);
+ denoising.functions.construct_transform = function_bind(&OpenCLDeviceBase::denoising_construct_transform, this, &denoising);
+ denoising.functions.reconstruct = function_bind(&OpenCLDeviceBase::denoising_reconstruct, this, _1, _2, _3, &denoising);
+ denoising.functions.divide_shadow = function_bind(&OpenCLDeviceBase::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising);
+ denoising.functions.non_local_means = function_bind(&OpenCLDeviceBase::denoising_non_local_means, this, _1, _2, _3, _4, &denoising);
+ denoising.functions.combine_halves = function_bind(&OpenCLDeviceBase::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising);
+ denoising.functions.get_feature = function_bind(&OpenCLDeviceBase::denoising_get_feature, this, _1, _2, _3, _4, &denoising);
+ denoising.functions.detect_outliers = function_bind(&OpenCLDeviceBase::denoising_detect_outliers, this, _1, _2, _3, _4, &denoising);
+
+ denoising.filter_area = make_int4(rtile.x, rtile.y, rtile.w, rtile.h);
+ denoising.render_buffer.samples = rtile.sample;
+
+ RenderTile rtiles[9];
+ rtiles[4] = rtile;
+ task.map_neighbor_tiles(rtiles, this);
+ denoising.tiles_from_rendertiles(rtiles);
+
+ denoising.init_from_devicetask(task);
+
+ denoising.run_denoising();
+
+ task.unmap_neighbor_tiles(rtiles, this);
+}
+
void OpenCLDeviceBase::shader(DeviceTask& task)
{
/* cast arguments to cl types */
@@ -612,7 +1066,7 @@ void OpenCLDeviceBase::shader(DeviceTask& task)
string OpenCLDeviceBase::kernel_build_options(const string *debug_src)
{
- string build_options = "-cl-fast-relaxed-math ";
+ string build_options = "-cl-no-signed-zeros -cl-mad-enable ";
if(platform_name == "NVIDIA CUDA") {
build_options += "-D__KERNEL_OPENCL_NVIDIA__ "
@@ -792,7 +1246,7 @@ void OpenCLDeviceBase::store_cached_kernel(
}
string OpenCLDeviceBase::build_options_for_base_program(
- const DeviceRequestedFeatures& /*requested_features*/)
+ const DeviceRequestedFeatures& requested_features)
{
/* TODO(sergey): By default we compile all features, meaning
* mega kernel is not getting feature-based optimizations.
@@ -800,6 +1254,14 @@ string OpenCLDeviceBase::build_options_for_base_program(
* Ideally we need always compile kernel with as less features
* enabled as possible to keep performance at it's max.
*/
+
+ /* For now disable baking when not in use as this has major
+ * impact on kernel build times.
+ */
+ if(!requested_features.use_baking) {
+ return "-D__NO_BAKING__";
+ }
+
return "";
}
diff --git a/intern/cycles/device/opencl/opencl_mega.cpp b/intern/cycles/device/opencl/opencl_mega.cpp
index a2fd1d71156..06c15bcf401 100644
--- a/intern/cycles/device/opencl/opencl_mega.cpp
+++ b/intern/cycles/device/opencl/opencl_mega.cpp
@@ -108,41 +108,53 @@ public:
else if(task->type == DeviceTask::SHADER) {
shader(*task);
}
- else if(task->type == DeviceTask::PATH_TRACE) {
+ else if(task->type == DeviceTask::RENDER) {
RenderTile tile;
/* Keep rendering tiles until done. */
while(task->acquire_tile(this, tile)) {
- int start_sample = tile.start_sample;
- int end_sample = tile.start_sample + tile.num_samples;
+ if(tile.task == RenderTile::PATH_TRACE) {
+ int start_sample = tile.start_sample;
+ int end_sample = tile.start_sample + tile.num_samples;
- for(int sample = start_sample; sample < end_sample; sample++) {
- if(task->get_cancel()) {
- if(task->need_finish_queue == false)
- break;
- }
+ for(int sample = start_sample; sample < end_sample; sample++) {
+ if(task->get_cancel()) {
+ if(task->need_finish_queue == false)
+ break;
+ }
+
+ path_trace(tile, sample);
- path_trace(tile, sample);
+ tile.sample = sample + 1;
- tile.sample = sample + 1;
+ task->update_progress(&tile, tile.w*tile.h);
+ }
+ /* Complete kernel execution before releasing the tile. */
+ /* This helps in multi-device render;
+ * The device that reaches the critical-section function
+ * release_tile waits (stalling other devices from entering
+ * release_tile) for all kernels to complete. If device1 (a
+ * slow-render device) reaches release_tile first then it would
+ * stall device2 (a fast-render device) from proceeding to render
+ * next tile.
+ */
+ clFinish(cqCommandQueue);
+ }
+ else if(tile.task == RenderTile::DENOISE) {
+ tile.sample = tile.start_sample + tile.num_samples;
+ denoise(tile, *task);
task->update_progress(&tile, tile.w*tile.h);
}
- /* Complete kernel execution before release tile */
- /* This helps in multi-device render;
- * The device that reaches the critical-section function
- * release_tile waits (stalling other devices from entering
- * release_tile) for all kernels to complete. If device1 (a
- * slow-render device) reaches release_tile first then it would
- * stall device2 (a fast-render device) from proceeding to render
- * next tile.
- */
- clFinish(cqCommandQueue);
-
task->release_tile(tile);
}
}
}
+
+ bool is_split_kernel()
+ {
+ return false;
+ }
};
Device *opencl_create_mega_device(DeviceInfo& info, Stats& stats, bool background)
diff --git a/intern/cycles/device/opencl/opencl_split.cpp b/intern/cycles/device/opencl/opencl_split.cpp
index b8df57ec7b9..76d9983e9a2 100644
--- a/intern/cycles/device/opencl/opencl_split.cpp
+++ b/intern/cycles/device/opencl/opencl_split.cpp
@@ -25,6 +25,7 @@
#include "device/device_split_kernel.h"
+#include "util/util_algorithm.h"
#include "util/util_logging.h"
#include "util/util_md5.h"
#include "util/util_path.h"
@@ -70,6 +71,10 @@ public:
delete split_kernel;
}
+ virtual bool show_samples() const {
+ return true;
+ }
+
virtual bool load_kernels(const DeviceRequestedFeatures& requested_features,
vector<OpenCLDeviceBase::OpenCLProgram*> &programs)
{
@@ -100,7 +105,7 @@ public:
else if(task->type == DeviceTask::SHADER) {
shader(*task);
}
- else if(task->type == DeviceTask::PATH_TRACE) {
+ else if(task->type == DeviceTask::RENDER) {
RenderTile tile;
/* Copy dummy KernelGlobals related to OpenCL from kernel_globals.h to
@@ -123,21 +128,29 @@ public:
/* Keep rendering tiles until done. */
while(task->acquire_tile(this, tile)) {
- split_kernel->path_trace(task,
- tile,
- kgbuffer,
- *const_mem_map["__data"]);
-
- /* Complete kernel execution before release tile. */
- /* This helps in multi-device render;
- * The device that reaches the critical-section function
- * release_tile waits (stalling other devices from entering
- * release_tile) for all kernels to complete. If device1 (a
- * slow-render device) reaches release_tile first then it would
- * stall device2 (a fast-render device) from proceeding to render
- * next tile.
- */
- clFinish(cqCommandQueue);
+ if(tile.task == RenderTile::PATH_TRACE) {
+ assert(tile.task == RenderTile::PATH_TRACE);
+ split_kernel->path_trace(task,
+ tile,
+ kgbuffer,
+ *const_mem_map["__data"]);
+
+ /* Complete kernel execution before releasing the tile. */
+ /* This helps in multi-device render;
+ * The device that reaches the critical-section function
+ * release_tile waits (stalling other devices from entering
+ * release_tile) for all kernels to complete. If device1 (a
+ * slow-render device) reaches release_tile first then it would
+ * stall device2 (a fast-render device) from proceeding to render
+ * next tile.
+ */
+ clFinish(cqCommandQueue);
+ }
+ else if(tile.task == RenderTile::DENOISE) {
+ tile.sample = tile.start_sample + tile.num_samples;
+ denoise(tile, *task);
+ task->update_progress(&tile, tile.w*tile.h);
+ }
task->release_tile(tile);
}
@@ -146,6 +159,11 @@ public:
}
}
+ bool is_split_kernel()
+ {
+ return true;
+ }
+
protected:
/* ** Those guys are for workign around some compiler-specific bugs ** */
@@ -159,17 +177,62 @@ protected:
friend class OpenCLSplitKernelFunction;
};
+struct CachedSplitMemory {
+ int id;
+ device_memory *split_data;
+ device_memory *ray_state;
+ device_ptr *rng_state;
+ device_memory *queue_index;
+ device_memory *use_queues_flag;
+ device_memory *work_pools;
+ device_ptr *buffer;
+};
+
class OpenCLSplitKernelFunction : public SplitKernelFunction {
public:
OpenCLDeviceSplitKernel* device;
OpenCLDeviceBase::OpenCLProgram program;
+ CachedSplitMemory& cached_memory;
+ int cached_id;
+
+ OpenCLSplitKernelFunction(OpenCLDeviceSplitKernel* device, CachedSplitMemory& cached_memory) :
+ device(device), cached_memory(cached_memory), cached_id(cached_memory.id-1)
+ {
+ }
- OpenCLSplitKernelFunction(OpenCLDeviceSplitKernel* device) : device(device) {}
- ~OpenCLSplitKernelFunction() { program.release(); }
+ ~OpenCLSplitKernelFunction()
+ {
+ program.release();
+ }
virtual bool enqueue(const KernelDimensions& dim, device_memory& kg, device_memory& data)
{
- device->kernel_set_args(program(), 0, kg, data);
+ if(cached_id != cached_memory.id) {
+ cl_uint start_arg_index =
+ device->kernel_set_args(program(),
+ 0,
+ kg,
+ data,
+ *cached_memory.split_data,
+ *cached_memory.ray_state,
+ *cached_memory.rng_state);
+
+/* TODO(sergey): Avoid map lookup here. */
+#define KERNEL_TEX(type, ttype, name) \
+ device->set_kernel_arg_mem(program(), &start_arg_index, #name);
+#include "kernel/kernel_textures.h"
+#undef KERNEL_TEX
+
+ start_arg_index +=
+ device->kernel_set_args(program(),
+ start_arg_index,
+ *cached_memory.queue_index,
+ *cached_memory.use_queues_flag,
+ *cached_memory.work_pools,
+ *cached_memory.buffer);
+
+ cached_id = cached_memory.id;
+ }
device->ciErr = clEnqueueNDRangeKernel(device->cqCommandQueue,
program(),
@@ -196,14 +259,15 @@ public:
class OpenCLSplitKernel : public DeviceSplitKernel {
OpenCLDeviceSplitKernel *device;
+ CachedSplitMemory cached_memory;
public:
explicit OpenCLSplitKernel(OpenCLDeviceSplitKernel *device) : DeviceSplitKernel(device), device(device) {
}
- virtual SplitKernelFunction* get_split_kernel_function(string kernel_name,
+ virtual SplitKernelFunction* get_split_kernel_function(const string& kernel_name,
const DeviceRequestedFeatures& requested_features)
{
- OpenCLSplitKernelFunction* kernel = new OpenCLSplitKernelFunction(device);
+ OpenCLSplitKernelFunction* kernel = new OpenCLSplitKernelFunction(device, cached_memory);
bool single_program = OpenCLInfo::use_single_program();
kernel->program =
@@ -332,6 +396,15 @@ public:
return false;
}
+ cached_memory.split_data = &split_data;
+ cached_memory.ray_state = &ray_state;
+ cached_memory.rng_state = &rtile.rng_state;
+ cached_memory.queue_index = &queue_index;
+ cached_memory.use_queues_flag = &use_queues_flag;
+ cached_memory.work_pools = &work_pool_wgs;
+ cached_memory.buffer = &rtile.buffer;
+ cached_memory.id++;
+
return true;
}
@@ -351,12 +424,18 @@ public:
cl_ulong max_buffer_size;
clGetDeviceInfo(device->cdDevice, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), &max_buffer_size, NULL);
+
+ if(DebugFlags().opencl.mem_limit) {
+ max_buffer_size = min(max_buffer_size,
+ cl_ulong(DebugFlags().opencl.mem_limit - device->stats.mem_used));
+ }
+
VLOG(1) << "Maximum device allocation size: "
<< string_human_readable_number(max_buffer_size) << " bytes. ("
<< string_human_readable_size(max_buffer_size) << ").";
size_t num_elements = max_elements_for_max_buffer_size(kg, data, max_buffer_size / 2);
- int2 global_size = make_int2(round_down((int)sqrt(num_elements), 64), (int)sqrt(num_elements));
+ int2 global_size = make_int2(max(round_down((int)sqrt(num_elements), 64), 64), (int)sqrt(num_elements));
VLOG(1) << "Global size: " << global_size << ".";
return global_size;
}
diff --git a/intern/cycles/device/opencl/opencl_util.cpp b/intern/cycles/device/opencl/opencl_util.cpp
index fe1c65a2224..0d34af3e040 100644
--- a/intern/cycles/device/opencl/opencl_util.cpp
+++ b/intern/cycles/device/opencl/opencl_util.cpp
@@ -241,9 +241,9 @@ string OpenCLCache::get_kernel_md5()
}
OpenCLDeviceBase::OpenCLProgram::OpenCLProgram(OpenCLDeviceBase *device,
- string program_name,
- string kernel_file,
- string kernel_build_options,
+ const string& program_name,
+ const string& kernel_file,
+ const string& kernel_build_options,
bool use_stdout)
: device(device),
program_name(program_name),
@@ -274,7 +274,7 @@ void OpenCLDeviceBase::OpenCLProgram::release()
}
}
-void OpenCLDeviceBase::OpenCLProgram::add_log(string msg, bool debug)
+void OpenCLDeviceBase::OpenCLProgram::add_log(const string& msg, bool debug)
{
if(!use_stdout) {
log += msg + "\n";
@@ -288,7 +288,7 @@ void OpenCLDeviceBase::OpenCLProgram::add_log(string msg, bool debug)
}
}
-void OpenCLDeviceBase::OpenCLProgram::add_error(string msg)
+void OpenCLDeviceBase::OpenCLProgram::add_error(const string& msg)
{
if(use_stdout) {
fprintf(stderr, "%s\n", msg.c_str());
@@ -608,6 +608,14 @@ bool OpenCLInfo::device_supported(const string& platform_name,
if(!get_device_name(device_id, &device_name)) {
return false;
}
+
+ int driver_major = 0;
+ int driver_minor = 0;
+ if(!get_driver_version(device_id, &driver_major, &driver_minor)) {
+ return false;
+ }
+ VLOG(3) << "OpenCL driver version " << driver_major << "." << driver_minor;
+
/* It is possible tyo have Iris GPU on AMD/Apple OpenCL framework
* (aka, it will not be on Intel framework). This isn't supported
* and needs an explicit blacklist.
@@ -618,6 +626,21 @@ bool OpenCLInfo::device_supported(const string& platform_name,
if(platform_name == "AMD Accelerated Parallel Processing" &&
device_type == CL_DEVICE_TYPE_GPU)
{
+ if(driver_major < 2236) {
+ VLOG(1) << "AMD driver version " << driver_major << "." << driver_minor << " not supported.";
+ return false;
+ }
+ const char *blacklist[] = {
+ /* GCN 1 */
+ "Tahiti", "Pitcairn", "Capeverde", "Oland",
+ NULL
+ };
+ for (int i = 0; blacklist[i] != NULL; i++) {
+ if(device_name == blacklist[i]) {
+ VLOG(1) << "AMD device " << device_name << " not supported";
+ return false;
+ }
+ }
return true;
}
if(platform_name == "Apple" && device_type == CL_DEVICE_TYPE_GPU) {
@@ -684,7 +707,7 @@ bool OpenCLInfo::device_version_check(cl_device_id device,
return true;
}
-string OpenCLInfo::get_hardware_id(string platform_name, cl_device_id device_id)
+string OpenCLInfo::get_hardware_id(const string& platform_name, cl_device_id device_id)
{
if(platform_name == "AMD Accelerated Parallel Processing" || platform_name == "Apple") {
/* Use cl_amd_device_topology extension. */
@@ -902,7 +925,7 @@ bool OpenCLInfo::get_platform_name(cl_platform_id platform_id,
string OpenCLInfo::get_platform_name(cl_platform_id platform_id)
{
string platform_name;
- if (!get_platform_name(platform_id, &platform_name)) {
+ if(!get_platform_name(platform_id, &platform_name)) {
return "";
}
return platform_name;
@@ -1063,7 +1086,7 @@ string OpenCLInfo::get_readable_device_name(cl_device_id device_id)
CL_DEVICE_BOARD_NAME_AMD,
sizeof(board_name),
&board_name,
- &length) == CL_SUCCESS)
+ &length) == CL_SUCCESS)
{
if(length != 0 && board_name[0] != '\0') {
return board_name;
@@ -1073,6 +1096,48 @@ string OpenCLInfo::get_readable_device_name(cl_device_id device_id)
return get_device_name(device_id);
}
+bool OpenCLInfo::get_driver_version(cl_device_id device_id,
+ int *major,
+ int *minor,
+ cl_int* error)
+{
+ char buffer[1024];
+ cl_int err;
+ if((err = clGetDeviceInfo(device_id,
+ CL_DRIVER_VERSION,
+ sizeof(buffer),
+ &buffer,
+ NULL)) != CL_SUCCESS)
+ {
+ if(error != NULL) {
+ *error = err;
+ }
+ return false;
+ }
+ if(error != NULL) {
+ *error = CL_SUCCESS;
+ }
+ if(sscanf(buffer, "%d.%d", major, minor) < 2) {
+ VLOG(1) << string_printf("OpenCL: failed to parse driver version string (%s).", buffer);
+ return false;
+ }
+ return true;
+}
+
+int OpenCLInfo::mem_address_alignment(cl_device_id device_id)
+{
+ int base_align_bits;
+ if(clGetDeviceInfo(device_id,
+ CL_DEVICE_MEM_BASE_ADDR_ALIGN,
+ sizeof(int),
+ &base_align_bits,
+ NULL) == CL_SUCCESS)
+ {
+ return base_align_bits/8;
+ }
+ return 1;
+}
+
CCL_NAMESPACE_END
#endif
diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt
index dbc2ba2503a..23e9bd311c4 100644
--- a/intern/cycles/kernel/CMakeLists.txt
+++ b/intern/cycles/kernel/CMakeLists.txt
@@ -10,7 +10,23 @@ set(INC_SYS
set(SRC
kernels/cpu/kernel.cpp
+ kernels/cpu/kernel_sse2.cpp
+ kernels/cpu/kernel_sse3.cpp
+ kernels/cpu/kernel_sse41.cpp
+ kernels/cpu/kernel_avx.cpp
+ kernels/cpu/kernel_avx2.cpp
kernels/cpu/kernel_split.cpp
+ kernels/cpu/kernel_split_sse2.cpp
+ kernels/cpu/kernel_split_sse3.cpp
+ kernels/cpu/kernel_split_sse41.cpp
+ kernels/cpu/kernel_split_avx.cpp
+ kernels/cpu/kernel_split_avx2.cpp
+ kernels/cpu/filter.cpp
+ kernels/cpu/filter_sse2.cpp
+ kernels/cpu/filter_sse3.cpp
+ kernels/cpu/filter_sse41.cpp
+ kernels/cpu/filter_avx.cpp
+ kernels/cpu/filter_avx2.cpp
kernels/opencl/kernel.cl
kernels/opencl/kernel_state_buffer_size.cl
kernels/opencl/kernel_split.cl
@@ -21,17 +37,22 @@ set(SRC
kernels/opencl/kernel_lamp_emission.cl
kernels/opencl/kernel_do_volume.cl
kernels/opencl/kernel_indirect_background.cl
+ kernels/opencl/kernel_shader_setup.cl
+ kernels/opencl/kernel_shader_sort.cl
kernels/opencl/kernel_shader_eval.cl
kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl
kernels/opencl/kernel_subsurface_scatter.cl
kernels/opencl/kernel_direct_lighting.cl
kernels/opencl/kernel_shadow_blocked_ao.cl
kernels/opencl/kernel_shadow_blocked_dl.cl
+ kernels/opencl/kernel_enqueue_inactive.cl
kernels/opencl/kernel_next_iteration_setup.cl
kernels/opencl/kernel_indirect_subsurface.cl
kernels/opencl/kernel_buffer_update.cl
+ kernels/opencl/filter.cl
kernels/cuda/kernel.cu
kernels/cuda/kernel_split.cu
+ kernels/cuda/filter.cu
)
set(SRC_BVH_HEADERS
@@ -93,12 +114,18 @@ set(SRC_KERNELS_CPU_HEADERS
kernels/cpu/kernel_cpu.h
kernels/cpu/kernel_cpu_impl.h
kernels/cpu/kernel_cpu_image.h
+ kernels/cpu/filter_cpu.h
+ kernels/cpu/filter_cpu_impl.h
)
set(SRC_KERNELS_CUDA_HEADERS
kernels/cuda/kernel_config.h
)
+set(SRC_KERNELS_OPENCL_HEADERS
+ kernels/opencl/kernel_split_function.h
+)
+
set(SRC_CLOSURE_HEADERS
closure/alloc.h
closure/bsdf.h
@@ -120,6 +147,8 @@ set(SRC_CLOSURE_HEADERS
closure/bssrdf.h
closure/emissive.h
closure/volume.h
+ closure/bsdf_principled_diffuse.h
+ closure/bsdf_principled_sheen.h
)
set(SRC_SVM_HEADERS
@@ -186,6 +215,21 @@ set(SRC_GEOM_HEADERS
geom/geom_volume.h
)
+set(SRC_FILTER_HEADERS
+ filter/filter.h
+ filter/filter_defines.h
+ filter/filter_features.h
+ filter/filter_features_sse.h
+ filter/filter_kernel.h
+ filter/filter_nlm_cpu.h
+ filter/filter_nlm_gpu.h
+ filter/filter_prefilter.h
+ filter/filter_reconstruction.h
+ filter/filter_transform.h
+ filter/filter_transform_gpu.h
+ filter/filter_transform_sse.h
+)
+
set(SRC_UTIL_HEADERS
../util/util_atomic.h
../util/util_color.h
@@ -194,17 +238,52 @@ set(SRC_UTIL_HEADERS
../util/util_math.h
../util/util_math_fast.h
../util/util_math_intersect.h
+ ../util/util_math_float2.h
+ ../util/util_math_float3.h
+ ../util/util_math_float4.h
+ ../util/util_math_int2.h
+ ../util/util_math_int3.h
+ ../util/util_math_int4.h
+ ../util/util_math_matrix.h
../util/util_static_assert.h
../util/util_transform.h
../util/util_texture.h
../util/util_types.h
+ ../util/util_types_float2.h
+ ../util/util_types_float2_impl.h
+ ../util/util_types_float3.h
+ ../util/util_types_float3_impl.h
+ ../util/util_types_float4.h
+ ../util/util_types_float4_impl.h
+ ../util/util_types_int2.h
+ ../util/util_types_int2_impl.h
+ ../util/util_types_int3.h
+ ../util/util_types_int3_impl.h
+ ../util/util_types_int4.h
+ ../util/util_types_int4_impl.h
+ ../util/util_types_uchar2.h
+ ../util/util_types_uchar2_impl.h
+ ../util/util_types_uchar3.h
+ ../util/util_types_uchar3_impl.h
+ ../util/util_types_uchar4.h
+ ../util/util_types_uchar4_impl.h
+ ../util/util_types_uint2.h
+ ../util/util_types_uint2_impl.h
+ ../util/util_types_uint3.h
+ ../util/util_types_uint3_impl.h
+ ../util/util_types_uint4.h
+ ../util/util_types_uint4_impl.h
+ ../util/util_types_vector3.h
+ ../util/util_types_vector3_impl.h
)
set(SRC_SPLIT_HEADERS
+ split/kernel_branched.h
split/kernel_buffer_update.h
split/kernel_data_init.h
split/kernel_direct_lighting.h
split/kernel_do_volume.h
+ split/kernel_enqueue_inactive.h
split/kernel_holdout_emission_blurring_pathtermination_ao.h
split/kernel_indirect_background.h
split/kernel_indirect_subsurface.h
@@ -213,6 +292,8 @@ set(SRC_SPLIT_HEADERS
split/kernel_path_init.h
split/kernel_queue_enqueue.h
split/kernel_scene_intersect.h
+ split/kernel_shader_setup.h
+ split/kernel_shader_sort.h
split/kernel_shader_eval.h
split/kernel_shadow_blocked_ao.h
split/kernel_shadow_blocked_dl.h
@@ -256,23 +337,21 @@ if(WITH_CYCLES_CUDA_BINARIES)
${SRC_CLOSURE_HEADERS}
${SRC_UTIL_HEADERS}
)
+ set(cuda_filter_sources kernels/cuda/filter.cu
+ ${SRC_HEADERS}
+ ${SRC_KERNELS_CUDA_HEADERS}
+ ${SRC_FILTER_HEADERS}
+ ${SRC_UTIL_HEADERS}
+ )
set(cuda_cubins)
- macro(CYCLES_CUDA_KERNEL_ADD arch split experimental)
- if(${split})
- set(cuda_extra_flags "-D__SPLIT__")
- set(cuda_cubin kernel_split)
- else()
- set(cuda_extra_flags "")
- set(cuda_cubin kernel)
- endif()
-
+ macro(CYCLES_CUDA_KERNEL_ADD arch name flags sources experimental)
if(${experimental})
- set(cuda_extra_flags ${cuda_extra_flags} -D__KERNEL_EXPERIMENTAL__)
- set(cuda_cubin ${cuda_cubin}_experimental)
+ set(flags ${flags} -D__KERNEL_EXPERIMENTAL__)
+ set(name ${name}_experimental)
endif()
- set(cuda_cubin ${cuda_cubin}_${arch}.cubin)
+ set(cuda_cubin ${name}_${arch}.cubin)
if(WITH_CYCLES_DEBUG)
set(cuda_debug_flags "-D__KERNEL_DEBUG__")
@@ -286,11 +365,7 @@ if(WITH_CYCLES_CUDA_BINARIES)
set(cuda_version_flags "-D__KERNEL_CUDA_VERSION__=${cuda_nvcc_version}")
set(cuda_math_flags "--use_fast_math")
- if(split)
- set(cuda_kernel_src "/kernels/cuda/kernel_split.cu")
- else()
- set(cuda_kernel_src "/kernels/cuda/kernel.cu")
- endif()
+ set(cuda_kernel_src "/kernels/cuda/${name}.cu")
add_custom_command(
OUTPUT ${cuda_cubin}
@@ -304,13 +379,13 @@ if(WITH_CYCLES_CUDA_BINARIES)
${cuda_arch_flags}
${cuda_version_flags}
${cuda_math_flags}
- ${cuda_extra_flags}
+ ${flags}
${cuda_debug_flags}
-I${CMAKE_CURRENT_SOURCE_DIR}/..
-DCCL_NAMESPACE_BEGIN=
-DCCL_NAMESPACE_END=
-DNVCC
- DEPENDS ${cuda_sources})
+ DEPENDS ${sources})
delayed_install("${CMAKE_CURRENT_BINARY_DIR}" "${cuda_cubin}" ${CYCLES_INSTALL_PATH}/lib)
list(APPEND cuda_cubins ${cuda_cubin})
@@ -324,11 +399,12 @@ if(WITH_CYCLES_CUDA_BINARIES)
foreach(arch ${CYCLES_CUDA_BINARIES_ARCH})
# Compile regular kernel
- CYCLES_CUDA_KERNEL_ADD(${arch} FALSE FALSE)
+ CYCLES_CUDA_KERNEL_ADD(${arch} kernel "" "${cuda_sources}" FALSE)
+ CYCLES_CUDA_KERNEL_ADD(${arch} filter "" "${cuda_filter_sources}" FALSE)
if(WITH_CYCLES_CUDA_SPLIT_KERNEL_BINARIES)
# Compile split kernel
- CYCLES_CUDA_KERNEL_ADD(${arch} TRUE FALSE)
+ CYCLES_CUDA_KERNEL_ADD(${arch} kernel_split "-D__SPLIT__" ${cuda_sources} FALSE)
endif()
endforeach()
@@ -349,41 +425,30 @@ include_directories(SYSTEM ${INC_SYS})
set_source_files_properties(kernels/cpu/kernel.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_KERNEL_FLAGS}")
set_source_files_properties(kernels/cpu/kernel_split.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_KERNEL_FLAGS}")
+set_source_files_properties(kernels/cpu/filter.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_KERNEL_FLAGS}")
if(CXX_HAS_SSE)
- list(APPEND SRC
- kernels/cpu/kernel_sse2.cpp
- kernels/cpu/kernel_sse3.cpp
- kernels/cpu/kernel_sse41.cpp
- kernels/cpu/kernel_split_sse2.cpp
- kernels/cpu/kernel_split_sse3.cpp
- kernels/cpu/kernel_split_sse41.cpp
- )
-
set_source_files_properties(kernels/cpu/kernel_sse2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS}")
set_source_files_properties(kernels/cpu/kernel_sse3.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE3_KERNEL_FLAGS}")
set_source_files_properties(kernels/cpu/kernel_sse41.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE41_KERNEL_FLAGS}")
set_source_files_properties(kernels/cpu/kernel_split_sse2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS}")
set_source_files_properties(kernels/cpu/kernel_split_sse3.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE3_KERNEL_FLAGS}")
set_source_files_properties(kernels/cpu/kernel_split_sse41.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE41_KERNEL_FLAGS}")
+ set_source_files_properties(kernels/cpu/filter_sse2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS}")
+ set_source_files_properties(kernels/cpu/filter_sse3.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE3_KERNEL_FLAGS}")
+ set_source_files_properties(kernels/cpu/filter_sse41.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE41_KERNEL_FLAGS}")
endif()
if(CXX_HAS_AVX)
- list(APPEND SRC
- kernels/cpu/kernel_avx.cpp
- kernels/cpu/kernel_split_avx.cpp
- )
set_source_files_properties(kernels/cpu/kernel_avx.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX_KERNEL_FLAGS}")
set_source_files_properties(kernels/cpu/kernel_split_avx.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX_KERNEL_FLAGS}")
+ set_source_files_properties(kernels/cpu/filter_avx.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX_KERNEL_FLAGS}")
endif()
if(CXX_HAS_AVX2)
- list(APPEND SRC
- kernels/cpu/kernel_avx2.cpp
- kernels/cpu/kernel_split_avx2.cpp
- )
set_source_files_properties(kernels/cpu/kernel_avx2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX2_KERNEL_FLAGS}")
set_source_files_properties(kernels/cpu/kernel_split_avx2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX2_KERNEL_FLAGS}")
+ set_source_files_properties(kernels/cpu/filter_avx2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX2_KERNEL_FLAGS}")
endif()
add_library(cycles_kernel
@@ -391,8 +456,10 @@ add_library(cycles_kernel
${SRC_HEADERS}
${SRC_KERNELS_CPU_HEADERS}
${SRC_KERNELS_CUDA_HEADERS}
+ ${SRC_KERNELS_OPENCL_HEADERS}
${SRC_BVH_HEADERS}
${SRC_CLOSURE_HEADERS}
+ ${SRC_FILTER_HEADERS}
${SRC_SVM_HEADERS}
${SRC_GEOM_HEADERS}
${SRC_SPLIT_HEADERS}
@@ -422,21 +489,28 @@ delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_scene_interse
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_lamp_emission.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_do_volume.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_indirect_background.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_shader_setup.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_shader_sort.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_shader_eval.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_subsurface_scatter.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_direct_lighting.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_shadow_blocked_ao.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_shadow_blocked_dl.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_enqueue_inactive.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_next_iteration_setup.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_indirect_subsurface.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_buffer_update.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_split_function.h" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/filter.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/cuda/kernel.cu" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/cuda)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/cuda/kernel_split.cu" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/cuda)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/cuda/filter.cu" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/cuda)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNELS_CUDA_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/cuda)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_BVH_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/bvh)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_CLOSURE_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/closure)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_FILTER_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/filter)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_SVM_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/svm)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_GEOM_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/geom)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_UTIL_HEADERS}" ${CYCLES_INSTALL_PATH}/source/util)
diff --git a/intern/cycles/kernel/closure/bsdf.h b/intern/cycles/kernel/closure/bsdf.h
index 9139b99353a..86a00d2124d 100644
--- a/intern/cycles/kernel/closure/bsdf.h
+++ b/intern/cycles/kernel/closure/bsdf.h
@@ -27,6 +27,8 @@
#include "kernel/closure/bsdf_ashikhmin_shirley.h"
#include "kernel/closure/bsdf_toon.h"
#include "kernel/closure/bsdf_hair.h"
+#include "kernel/closure/bsdf_principled_diffuse.h"
+#include "kernel/closure/bsdf_principled_sheen.h"
#ifdef __SUBSURFACE__
# include "kernel/closure/bssrdf.h"
#endif
@@ -86,16 +88,21 @@ ccl_device_forceinline int bsdf_sample(KernelGlobals *kg,
eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
break;
case CLOSURE_BSDF_MICROFACET_GGX_ID:
+ case CLOSURE_BSDF_MICROFACET_GGX_FRESNEL_ID:
+ case CLOSURE_BSDF_MICROFACET_GGX_CLEARCOAT_ID:
case CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID:
+ case CLOSURE_BSDF_MICROFACET_GGX_ANISO_FRESNEL_ID:
case CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID:
label = bsdf_microfacet_ggx_sample(kg, sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
break;
case CLOSURE_BSDF_MICROFACET_MULTI_GGX_ID:
+ case CLOSURE_BSDF_MICROFACET_MULTI_GGX_FRESNEL_ID:
label = bsdf_microfacet_multi_ggx_sample(kg, sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
eval, omega_in, &domega_in->dx, &domega_in->dy, pdf, &sd->lcg_state);
break;
case CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID:
+ case CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_FRESNEL_ID:
label = bsdf_microfacet_multi_ggx_glass_sample(kg, sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
eval, omega_in, &domega_in->dx, &domega_in->dy, pdf, &sd->lcg_state);
break;
@@ -130,6 +137,17 @@ ccl_device_forceinline int bsdf_sample(KernelGlobals *kg,
label = bsdf_hair_transmission_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
break;
+#ifdef __PRINCIPLED__
+ case CLOSURE_BSDF_PRINCIPLED_DIFFUSE_ID:
+ case CLOSURE_BSDF_BSSRDF_PRINCIPLED_ID:
+ label = bsdf_principled_diffuse_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
+ eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
+ break;
+ case CLOSURE_BSDF_PRINCIPLED_SHEEN_ID:
+ label = bsdf_principled_sheen_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
+ eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
+ break;
+#endif /* __PRINCIPLED__ */
#endif
#ifdef __VOLUME__
case CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID:
@@ -188,14 +206,19 @@ float3 bsdf_eval(KernelGlobals *kg,
eval = bsdf_transparent_eval_reflect(sc, sd->I, omega_in, pdf);
break;
case CLOSURE_BSDF_MICROFACET_GGX_ID:
+ case CLOSURE_BSDF_MICROFACET_GGX_FRESNEL_ID:
+ case CLOSURE_BSDF_MICROFACET_GGX_CLEARCOAT_ID:
case CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID:
+ case CLOSURE_BSDF_MICROFACET_GGX_ANISO_FRESNEL_ID:
case CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID:
eval = bsdf_microfacet_ggx_eval_reflect(sc, sd->I, omega_in, pdf);
break;
case CLOSURE_BSDF_MICROFACET_MULTI_GGX_ID:
+ case CLOSURE_BSDF_MICROFACET_MULTI_GGX_FRESNEL_ID:
eval = bsdf_microfacet_multi_ggx_eval_reflect(sc, sd->I, omega_in, pdf, &sd->lcg_state);
break;
case CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID:
+ case CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_FRESNEL_ID:
eval = bsdf_microfacet_multi_ggx_glass_eval_reflect(sc, sd->I, omega_in, pdf, &sd->lcg_state);
break;
case CLOSURE_BSDF_MICROFACET_BECKMANN_ID:
@@ -222,6 +245,15 @@ float3 bsdf_eval(KernelGlobals *kg,
case CLOSURE_BSDF_HAIR_TRANSMISSION_ID:
eval = bsdf_hair_transmission_eval_reflect(sc, sd->I, omega_in, pdf);
break;
+#ifdef __PRINCIPLED__
+ case CLOSURE_BSDF_PRINCIPLED_DIFFUSE_ID:
+ case CLOSURE_BSDF_BSSRDF_PRINCIPLED_ID:
+ eval = bsdf_principled_diffuse_eval_reflect(sc, sd->I, omega_in, pdf);
+ break;
+ case CLOSURE_BSDF_PRINCIPLED_SHEEN_ID:
+ eval = bsdf_principled_sheen_eval_reflect(sc, sd->I, omega_in, pdf);
+ break;
+#endif /* __PRINCIPLED__ */
#endif
#ifdef __VOLUME__
case CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID:
@@ -256,14 +288,19 @@ float3 bsdf_eval(KernelGlobals *kg,
eval = bsdf_transparent_eval_transmit(sc, sd->I, omega_in, pdf);
break;
case CLOSURE_BSDF_MICROFACET_GGX_ID:
+ case CLOSURE_BSDF_MICROFACET_GGX_FRESNEL_ID:
+ case CLOSURE_BSDF_MICROFACET_GGX_CLEARCOAT_ID:
case CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID:
+ case CLOSURE_BSDF_MICROFACET_GGX_ANISO_FRESNEL_ID:
case CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID:
eval = bsdf_microfacet_ggx_eval_transmit(sc, sd->I, omega_in, pdf);
break;
case CLOSURE_BSDF_MICROFACET_MULTI_GGX_ID:
+ case CLOSURE_BSDF_MICROFACET_MULTI_GGX_FRESNEL_ID:
eval = bsdf_microfacet_multi_ggx_eval_transmit(sc, sd->I, omega_in, pdf, &sd->lcg_state);
break;
case CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID:
+ case CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_FRESNEL_ID:
eval = bsdf_microfacet_multi_ggx_glass_eval_transmit(sc, sd->I, omega_in, pdf, &sd->lcg_state);
break;
case CLOSURE_BSDF_MICROFACET_BECKMANN_ID:
@@ -290,6 +327,15 @@ float3 bsdf_eval(KernelGlobals *kg,
case CLOSURE_BSDF_HAIR_TRANSMISSION_ID:
eval = bsdf_hair_transmission_eval_transmit(sc, sd->I, omega_in, pdf);
break;
+#ifdef __PRINCIPLED__
+ case CLOSURE_BSDF_PRINCIPLED_DIFFUSE_ID:
+ case CLOSURE_BSDF_BSSRDF_PRINCIPLED_ID:
+ eval = bsdf_principled_diffuse_eval_transmit(sc, sd->I, omega_in, pdf);
+ break;
+ case CLOSURE_BSDF_PRINCIPLED_SHEEN_ID:
+ eval = bsdf_principled_sheen_eval_transmit(sc, sd->I, omega_in, pdf);
+ break;
+#endif /* __PRINCIPLED__ */
#endif
#ifdef __VOLUME__
case CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID:
@@ -311,11 +357,16 @@ ccl_device void bsdf_blur(KernelGlobals *kg, ShaderClosure *sc, float roughness)
#ifdef __SVM__
switch(sc->type) {
case CLOSURE_BSDF_MICROFACET_MULTI_GGX_ID:
+ case CLOSURE_BSDF_MICROFACET_MULTI_GGX_FRESNEL_ID:
case CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID:
+ case CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_FRESNEL_ID:
bsdf_microfacet_multi_ggx_blur(sc, roughness);
break;
case CLOSURE_BSDF_MICROFACET_GGX_ID:
+ case CLOSURE_BSDF_MICROFACET_GGX_FRESNEL_ID:
+ case CLOSURE_BSDF_MICROFACET_GGX_CLEARCOAT_ID:
case CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID:
+ case CLOSURE_BSDF_MICROFACET_GGX_ANISO_FRESNEL_ID:
case CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID:
bsdf_microfacet_ggx_blur(sc, roughness);
break;
@@ -349,10 +400,15 @@ ccl_device bool bsdf_merge(ShaderClosure *a, ShaderClosure *b)
case CLOSURE_BSDF_REFLECTION_ID:
case CLOSURE_BSDF_REFRACTION_ID:
case CLOSURE_BSDF_MICROFACET_GGX_ID:
+ case CLOSURE_BSDF_MICROFACET_GGX_FRESNEL_ID:
+ case CLOSURE_BSDF_MICROFACET_GGX_CLEARCOAT_ID:
case CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID:
+ case CLOSURE_BSDF_MICROFACET_GGX_ANISO_FRESNEL_ID:
case CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID:
case CLOSURE_BSDF_MICROFACET_MULTI_GGX_ID:
+ case CLOSURE_BSDF_MICROFACET_MULTI_GGX_FRESNEL_ID:
case CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID:
+ case CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_FRESNEL_ID:
case CLOSURE_BSDF_MICROFACET_BECKMANN_ID:
case CLOSURE_BSDF_MICROFACET_BECKMANN_ANISO_ID:
case CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID:
@@ -367,6 +423,11 @@ ccl_device bool bsdf_merge(ShaderClosure *a, ShaderClosure *b)
case CLOSURE_BSDF_HAIR_REFLECTION_ID:
case CLOSURE_BSDF_HAIR_TRANSMISSION_ID:
return bsdf_hair_merge(a, b);
+#ifdef __PRINCIPLED__
+ case CLOSURE_BSDF_PRINCIPLED_DIFFUSE_ID:
+ case CLOSURE_BSDF_BSSRDF_PRINCIPLED_ID:
+ return bsdf_principled_diffuse_merge(a, b);
+#endif
#ifdef __VOLUME__
case CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID:
return volume_henyey_greenstein_merge(a, b);
@@ -379,5 +440,23 @@ ccl_device bool bsdf_merge(ShaderClosure *a, ShaderClosure *b)
#endif
}
+/* Classifies a closure as diffuse-like or specular-like.
+ * This is needed for the denoising feature pass generation,
+ * which are written on the first bounce where more than 25%
+ * of the sampling weight belongs to diffuse-line closures. */
+ccl_device_inline bool bsdf_is_specular_like(ShaderClosure *sc)
+{
+ if(CLOSURE_IS_BSDF_TRANSPARENT(sc->type)) {
+ return true;
+ }
+
+ if(CLOSURE_IS_BSDF_MICROFACET(sc->type)) {
+ MicrofacetBsdf *bsdf = (MicrofacetBsdf*) sc;
+ return (bsdf->alpha_x*bsdf->alpha_y <= 0.075f*0.075f);
+ }
+
+ return false;
+}
+
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/closure/bsdf_ashikhmin_velvet.h b/intern/cycles/kernel/closure/bsdf_ashikhmin_velvet.h
index 7e0f5a7ec75..a5ba2cb2972 100644
--- a/intern/cycles/kernel/closure/bsdf_ashikhmin_velvet.h
+++ b/intern/cycles/kernel/closure/bsdf_ashikhmin_velvet.h
@@ -40,7 +40,6 @@ typedef ccl_addr_space struct VelvetBsdf {
float sigma;
float invsigma2;
- float3 N;
} VelvetBsdf;
ccl_device int bsdf_ashikhmin_velvet_setup(VelvetBsdf *bsdf)
diff --git a/intern/cycles/kernel/closure/bsdf_diffuse.h b/intern/cycles/kernel/closure/bsdf_diffuse.h
index dcd187f9305..ec6f1f20996 100644
--- a/intern/cycles/kernel/closure/bsdf_diffuse.h
+++ b/intern/cycles/kernel/closure/bsdf_diffuse.h
@@ -37,7 +37,6 @@ CCL_NAMESPACE_BEGIN
typedef ccl_addr_space struct DiffuseBsdf {
SHADER_CLOSURE_BASE;
- float3 N;
} DiffuseBsdf;
/* DIFFUSE */
diff --git a/intern/cycles/kernel/closure/bsdf_diffuse_ramp.h b/intern/cycles/kernel/closure/bsdf_diffuse_ramp.h
index 2d982a95fe4..24f40af46a3 100644
--- a/intern/cycles/kernel/closure/bsdf_diffuse_ramp.h
+++ b/intern/cycles/kernel/closure/bsdf_diffuse_ramp.h
@@ -40,7 +40,6 @@ CCL_NAMESPACE_BEGIN
typedef ccl_addr_space struct DiffuseRampBsdf {
SHADER_CLOSURE_BASE;
- float3 N;
float3 *colors;
} DiffuseRampBsdf;
diff --git a/intern/cycles/kernel/closure/bsdf_microfacet.h b/intern/cycles/kernel/closure/bsdf_microfacet.h
index 1c7b3eb9ddd..b12e248f0a3 100644
--- a/intern/cycles/kernel/closure/bsdf_microfacet.h
+++ b/intern/cycles/kernel/closure/bsdf_microfacet.h
@@ -36,7 +36,8 @@
CCL_NAMESPACE_BEGIN
typedef ccl_addr_space struct MicrofacetExtra {
- float3 color;
+ float3 color, cspec0;
+ float clearcoat;
} MicrofacetExtra;
typedef ccl_addr_space struct MicrofacetBsdf {
@@ -45,7 +46,6 @@ typedef ccl_addr_space struct MicrofacetBsdf {
float alpha_x, alpha_y, ior;
MicrofacetExtra *extra;
float3 T;
- float3 N;
} MicrofacetBsdf;
/* Beckmann and GGX microfacet importance sampling. */
@@ -233,6 +233,36 @@ ccl_device_forceinline float3 microfacet_sample_stretched(
return normalize(make_float3(-slope_x, -slope_y, 1.0f));
}
+/* Calculate the reflection color
+ *
+ * If fresnel is used, the color is an interpolation of the F0 color and white
+ * with respect to the fresnel
+ *
+ * Else it is simply white
+ */
+ccl_device_forceinline float3 reflection_color(const MicrofacetBsdf *bsdf, float3 L, float3 H) {
+ float3 F = make_float3(1.0f, 1.0f, 1.0f);
+ bool use_fresnel = (bsdf->type == CLOSURE_BSDF_MICROFACET_GGX_FRESNEL_ID
+ || bsdf->type == CLOSURE_BSDF_MICROFACET_GGX_CLEARCOAT_ID
+ || bsdf->type == CLOSURE_BSDF_MICROFACET_GGX_ANISO_FRESNEL_ID);
+
+ if(use_fresnel) {
+ float F0 = fresnel_dielectric_cos(1.0f, bsdf->ior);
+
+ F = interpolate_fresnel_color(L, H, bsdf->ior, F0, bsdf->extra->cspec0);
+ }
+
+ return F;
+}
+
+ccl_device_forceinline float D_GTR1(float NdotH, float alpha)
+{
+ if(alpha >= 1.0f) return M_1_PI_F;
+ float alpha2 = alpha*alpha;
+ float t = 1.0f + (alpha2 - 1.0f) * NdotH*NdotH;
+ return (alpha2 - 1.0f) / (M_PI_F * logf(alpha2) * t);
+}
+
/* GGX microfacet with Smith shadow-masking from:
*
* Microfacet Models for Refraction through Rough Surfaces
@@ -248,14 +278,52 @@ ccl_device_forceinline float3 microfacet_sample_stretched(
ccl_device int bsdf_microfacet_ggx_setup(MicrofacetBsdf *bsdf)
{
+ bsdf->extra = NULL;
+
bsdf->alpha_x = saturate(bsdf->alpha_x);
bsdf->alpha_y = bsdf->alpha_x;
-
+
bsdf->type = CLOSURE_BSDF_MICROFACET_GGX_ID;
return SD_BSDF|SD_BSDF_HAS_EVAL;
}
+ccl_device int bsdf_microfacet_ggx_fresnel_setup(MicrofacetBsdf *bsdf, const ShaderData *sd)
+{
+ bsdf->extra->cspec0.x = saturate(bsdf->extra->cspec0.x);
+ bsdf->extra->cspec0.y = saturate(bsdf->extra->cspec0.y);
+ bsdf->extra->cspec0.z = saturate(bsdf->extra->cspec0.z);
+
+ float F0 = fresnel_dielectric_cos(1.0f, bsdf->ior);
+ float F = average(interpolate_fresnel_color(sd->I, bsdf->N, bsdf->ior, F0, bsdf->extra->cspec0));
+ bsdf->sample_weight *= F;
+
+ bsdf->alpha_x = saturate(bsdf->alpha_x);
+ bsdf->alpha_y = bsdf->alpha_x;
+
+ bsdf->type = CLOSURE_BSDF_MICROFACET_GGX_FRESNEL_ID;
+
+ return SD_BSDF|SD_BSDF_HAS_EVAL;
+}
+
+ccl_device int bsdf_microfacet_ggx_clearcoat_setup(MicrofacetBsdf *bsdf, const ShaderData *sd)
+{
+ bsdf->extra->cspec0.x = saturate(bsdf->extra->cspec0.x);
+ bsdf->extra->cspec0.y = saturate(bsdf->extra->cspec0.y);
+ bsdf->extra->cspec0.z = saturate(bsdf->extra->cspec0.z);
+
+ float F0 = fresnel_dielectric_cos(1.0f, bsdf->ior);
+ float F = average(interpolate_fresnel_color(sd->I, bsdf->N, bsdf->ior, F0, bsdf->extra->cspec0));
+ bsdf->sample_weight *= 0.25f * bsdf->extra->clearcoat * F;
+
+ bsdf->alpha_x = saturate(bsdf->alpha_x);
+ bsdf->alpha_y = bsdf->alpha_x;
+
+ bsdf->type = CLOSURE_BSDF_MICROFACET_GGX_CLEARCOAT_ID;
+
+ return SD_BSDF|SD_BSDF_HAS_EVAL;
+}
+
ccl_device bool bsdf_microfacet_merge(const ShaderClosure *a, const ShaderClosure *b)
{
const MicrofacetBsdf *bsdf_a = (const MicrofacetBsdf*)a;
@@ -273,16 +341,38 @@ ccl_device bool bsdf_microfacet_merge(const ShaderClosure *a, const ShaderClosur
ccl_device int bsdf_microfacet_ggx_aniso_setup(MicrofacetBsdf *bsdf)
{
+ bsdf->extra = NULL;
+
bsdf->alpha_x = saturate(bsdf->alpha_x);
bsdf->alpha_y = saturate(bsdf->alpha_y);
-
+
bsdf->type = CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID;
return SD_BSDF|SD_BSDF_HAS_EVAL;
}
+ccl_device int bsdf_microfacet_ggx_aniso_fresnel_setup(MicrofacetBsdf *bsdf, const ShaderData *sd)
+{
+ bsdf->extra->cspec0.x = saturate(bsdf->extra->cspec0.x);
+ bsdf->extra->cspec0.y = saturate(bsdf->extra->cspec0.y);
+ bsdf->extra->cspec0.z = saturate(bsdf->extra->cspec0.z);
+
+ float F0 = fresnel_dielectric_cos(1.0f, bsdf->ior);
+ float F = average(interpolate_fresnel_color(sd->I, bsdf->N, bsdf->ior, F0, bsdf->extra->cspec0));
+ bsdf->sample_weight *= F;
+
+ bsdf->alpha_x = saturate(bsdf->alpha_x);
+ bsdf->alpha_y = saturate(bsdf->alpha_y);
+
+ bsdf->type = CLOSURE_BSDF_MICROFACET_GGX_ANISO_FRESNEL_ID;
+
+ return SD_BSDF|SD_BSDF_HAS_EVAL;
+}
+
ccl_device int bsdf_microfacet_ggx_refraction_setup(MicrofacetBsdf *bsdf)
{
+ bsdf->extra = NULL;
+
bsdf->alpha_x = saturate(bsdf->alpha_x);
bsdf->alpha_y = bsdf->alpha_x;
@@ -319,6 +409,8 @@ ccl_device float3 bsdf_microfacet_ggx_eval_reflect(const ShaderClosure *sc, cons
float alpha2 = alpha_x * alpha_y;
float D, G1o, G1i;
+ bool is_principled_clearcoat = (bsdf->type == CLOSURE_BSDF_MICROFACET_GGX_CLEARCOAT_ID);
+
if(alpha_x == alpha_y) {
/* isotropic
* eq. 20: (F*G*D)/(4*in*on)
@@ -327,7 +419,18 @@ ccl_device float3 bsdf_microfacet_ggx_eval_reflect(const ShaderClosure *sc, cons
float cosThetaM2 = cosThetaM * cosThetaM;
float cosThetaM4 = cosThetaM2 * cosThetaM2;
float tanThetaM2 = (1 - cosThetaM2) / cosThetaM2;
- D = alpha2 / (M_PI_F * cosThetaM4 * (alpha2 + tanThetaM2) * (alpha2 + tanThetaM2));
+
+ if(is_principled_clearcoat) {
+ /* use GTR1 for clearcoat */
+ D = D_GTR1(cosThetaM, bsdf->alpha_x);
+
+ /* the alpha value for clearcoat is a fixed 0.25 => alpha2 = 0.25 * 0.25 */
+ alpha2 = 0.0625f;
+ }
+ else {
+ /* use GTR2 otherwise */
+ D = alpha2 / (M_PI_F * cosThetaM4 * (alpha2 + tanThetaM2) * (alpha2 + tanThetaM2));
+ }
/* eq. 34: now calculate G1(i,m) and G1(o,m) */
G1o = 2 / (1 + safe_sqrtf(1 + alpha2 * (1 - cosNO * cosNO) / (cosNO * cosNO)));
@@ -374,7 +477,13 @@ ccl_device float3 bsdf_microfacet_ggx_eval_reflect(const ShaderClosure *sc, cons
/* eq. 20 */
float common = D * 0.25f / cosNO;
- float out = G * common;
+
+ float3 F = reflection_color(bsdf, omega_in, m);
+ if(is_principled_clearcoat) {
+ F *= 0.25f * bsdf->extra->clearcoat;
+ }
+
+ float3 out = F * G * common;
/* eq. 2 in distribution of visible normals sampling
* pm = Dw = G1o * dot(m, I) * D / dot(N, I); */
@@ -384,7 +493,7 @@ ccl_device float3 bsdf_microfacet_ggx_eval_reflect(const ShaderClosure *sc, cons
* pdf = pm * 0.25 / dot(m, I); */
*pdf = G1o * common;
- return make_float3(out, out, out);
+ return out;
}
return make_float3(0.0f, 0.0f, 0.0f);
@@ -489,6 +598,17 @@ ccl_device int bsdf_microfacet_ggx_sample(KernelGlobals *kg, const ShaderClosure
/* some high number for MIS */
*pdf = 1e6f;
*eval = make_float3(1e6f, 1e6f, 1e6f);
+
+ bool use_fresnel = (bsdf->type == CLOSURE_BSDF_MICROFACET_GGX_FRESNEL_ID
+ || bsdf->type == CLOSURE_BSDF_MICROFACET_GGX_CLEARCOAT_ID
+ || bsdf->type == CLOSURE_BSDF_MICROFACET_GGX_ANISO_FRESNEL_ID);
+
+ /* if fresnel is used, calculate the color with reflection_color(...) */
+ if(use_fresnel) {
+ *pdf = 1.0f;
+ *eval = reflection_color(bsdf, *omega_in, m);
+ }
+
label = LABEL_REFLECT | LABEL_SINGULAR;
}
else {
@@ -497,16 +617,32 @@ ccl_device int bsdf_microfacet_ggx_sample(KernelGlobals *kg, const ShaderClosure
float alpha2 = alpha_x * alpha_y;
float D, G1i;
+ bool is_principled_clearcoat = (bsdf->type == CLOSURE_BSDF_MICROFACET_GGX_CLEARCOAT_ID);
+
if(alpha_x == alpha_y) {
/* isotropic */
float cosThetaM2 = cosThetaM * cosThetaM;
float cosThetaM4 = cosThetaM2 * cosThetaM2;
float tanThetaM2 = 1/(cosThetaM2) - 1;
- D = alpha2 / (M_PI_F * cosThetaM4 * (alpha2 + tanThetaM2) * (alpha2 + tanThetaM2));
/* eval BRDF*cosNI */
float cosNI = dot(N, *omega_in);
+ if(is_principled_clearcoat) {
+ /* use GTR1 for clearcoat */
+ D = D_GTR1(cosThetaM, bsdf->alpha_x);
+
+ /* the alpha value for clearcoat is a fixed 0.25 => alpha2 = 0.25 * 0.25 */
+ alpha2 = 0.0625f;
+
+ /* recalculate G1o */
+ G1o = 2 / (1 + safe_sqrtf(1 + alpha2 * (1 - cosNO * cosNO) / (cosNO * cosNO)));
+ }
+ else {
+ /* use GTR2 otherwise */
+ D = alpha2 / (M_PI_F * cosThetaM4 * (alpha2 + tanThetaM2) * (alpha2 + tanThetaM2));
+ }
+
/* eq. 34: now calculate G1(i,m) */
G1i = 2 / (1 + safe_sqrtf(1 + alpha2 * (1 - cosNI * cosNI) / (cosNI * cosNI)));
}
@@ -538,10 +674,14 @@ ccl_device int bsdf_microfacet_ggx_sample(KernelGlobals *kg, const ShaderClosure
/* see eval function for derivation */
float common = (G1o * D) * 0.25f / cosNO;
- float out = G1i * common;
*pdf = common;
- *eval = make_float3(out, out, out);
+ float3 F = reflection_color(bsdf, *omega_in, m);
+ if(is_principled_clearcoat) {
+ F *= 0.25f * bsdf->extra->clearcoat;
+ }
+
+ *eval = G1i * common * F;
}
#ifdef __RAY_DIFFERENTIALS__
diff --git a/intern/cycles/kernel/closure/bsdf_microfacet_multi.h b/intern/cycles/kernel/closure/bsdf_microfacet_multi.h
index 7d87727004f..2f2c35d5d1f 100644
--- a/intern/cycles/kernel/closure/bsdf_microfacet_multi.h
+++ b/intern/cycles/kernel/closure/bsdf_microfacet_multi.h
@@ -40,20 +40,20 @@ ccl_device_forceinline float D_ggx_aniso(const float3 wm, const float2 alpha)
}
/* Sample slope distribution (based on page 14 of the supplemental implementation). */
-ccl_device_forceinline float2 mf_sampleP22_11(const float cosI, const float2 randU)
+ccl_device_forceinline float2 mf_sampleP22_11(const float cosI, const float randx, const float randy)
{
- if(cosI > 0.9999f || cosI < 1e-6f) {
- const float r = sqrtf(randU.x / max(1.0f - randU.x, 1e-7f));
- const float phi = M_2PI_F * randU.y;
+ if(cosI > 0.9999f || fabsf(cosI) < 1e-6f) {
+ const float r = sqrtf(randx / max(1.0f - randx, 1e-7f));
+ const float phi = M_2PI_F * randy;
return make_float2(r*cosf(phi), r*sinf(phi));
}
- const float sinI = sqrtf(1.0f - cosI*cosI);
+ const float sinI = safe_sqrtf(1.0f - cosI*cosI);
const float tanI = sinI/cosI;
const float projA = 0.5f * (cosI + 1.0f);
if(projA < 0.0001f)
return make_float2(0.0f, 0.0f);
- const float A = 2.0f*randU.x*projA / cosI - 1.0f;
+ const float A = 2.0f*randx*projA / cosI - 1.0f;
float tmp = A*A-1.0f;
if(fabsf(tmp) < 1e-7f)
return make_float2(0.0f, 0.0f);
@@ -64,24 +64,24 @@ ccl_device_forceinline float2 mf_sampleP22_11(const float cosI, const float2 ran
const float slopeX = (A < 0.0f || slopeX2 > 1.0f/tanI)? (tanI*tmp - D) : slopeX2;
float U2;
- if(randU.y >= 0.5f)
- U2 = 2.0f*(randU.y - 0.5f);
+ if(randy >= 0.5f)
+ U2 = 2.0f*(randy - 0.5f);
else
- U2 = 2.0f*(0.5f - randU.y);
+ U2 = 2.0f*(0.5f - randy);
const float z = (U2*(U2*(U2*0.27385f-0.73369f)+0.46341f)) / (U2*(U2*(U2*0.093073f+0.309420f)-1.0f)+0.597999f);
const float slopeY = z * sqrtf(1.0f + slopeX*slopeX);
- if(randU.y >= 0.5f)
+ if(randy >= 0.5f)
return make_float2(slopeX, slopeY);
else
return make_float2(slopeX, -slopeY);
}
/* Visible normal sampling for the GGX distribution (based on page 7 of the supplemental implementation). */
-ccl_device_forceinline float3 mf_sample_vndf(const float3 wi, const float2 alpha, const float2 randU)
+ccl_device_forceinline float3 mf_sample_vndf(const float3 wi, const float2 alpha, const float randx, const float randy)
{
const float3 wi_11 = normalize(make_float3(alpha.x*wi.x, alpha.y*wi.y, wi.z));
- const float2 slope_11 = mf_sampleP22_11(wi_11.z, randU);
+ const float2 slope_11 = mf_sampleP22_11(wi_11.z, randx, randy);
const float3 cossin_phi = safe_normalize(make_float3(wi_11.x, wi_11.y, 0.0f));
const float slope_x = alpha.x*(cossin_phi.x * slope_11.x - cossin_phi.y * slope_11.y);
@@ -91,18 +91,15 @@ ccl_device_forceinline float3 mf_sample_vndf(const float3 wi, const float2 alpha
return normalize(make_float3(-slope_x, -slope_y, 1.0f));
}
-/* === Phase functions: Glossy, Diffuse and Glass === */
+/* === Phase functions: Glossy and Glass === */
-/* Phase function for reflective materials, either without a fresnel term (for compatibility) or with the conductive fresnel term. */
-ccl_device_forceinline float3 mf_sample_phase_glossy(const float3 wi, float3 *n, float3 *k, float3 *weight, const float3 wm)
+/* Phase function for reflective materials. */
+ccl_device_forceinline float3 mf_sample_phase_glossy(const float3 wi, float3 *weight, const float3 wm)
{
- if(n && k)
- *weight *= fresnel_conductor(dot(wi, wm), *n, *k);
-
return -wi + 2.0f * wm * dot(wi, wm);
}
-ccl_device_forceinline float3 mf_eval_phase_glossy(const float3 w, const float lambda, const float3 wo, const float2 alpha, float3 *n, float3 *k)
+ccl_device_forceinline float3 mf_eval_phase_glossy(const float3 w, const float lambda, const float3 wo, const float2 alpha)
{
if(w.z > 0.9999f)
return make_float3(0.0f, 0.0f, 0.0f);
@@ -123,30 +120,9 @@ ccl_device_forceinline float3 mf_eval_phase_glossy(const float3 w, const float l
else
phase *= D_ggx_aniso(wh, alpha);
- if(n && k) {
- /* Apply conductive fresnel term. */
- return phase * fresnel_conductor(dotW_WH, *n, *k);
- }
-
return make_float3(phase, phase, phase);
}
-/* Phase function for rough lambertian diffuse surfaces. */
-ccl_device_forceinline float3 mf_sample_phase_diffuse(const float3 wm, const float randu, const float randv)
-{
- float3 tm, bm;
- make_orthonormals(wm, &tm, &bm);
-
- float2 disk = concentric_sample_disk(randu, randv);
- return disk.x*tm + disk.y*bm + safe_sqrtf(1.0f - disk.x*disk.x - disk.y*disk.y)*wm;
-}
-
-ccl_device_forceinline float3 mf_eval_phase_diffuse(const float3 w, const float3 wm)
-{
- const float v = max(0.0f, dot(w, wm)) * M_1_PI_F;
- return make_float3(v, v, v);
-}
-
/* Phase function for dielectric transmissive materials, including both reflection and refraction according to the dielectric fresnel term. */
ccl_device_forceinline float3 mf_sample_phase_glass(const float3 wi, const float eta, const float3 wm, const float randV, bool *outside)
{
@@ -269,40 +245,69 @@ ccl_device_forceinline float mf_ggx_albedo(float r)
return saturate(albedo);
}
+ccl_device_inline float mf_ggx_transmission_albedo(float a, float ior)
+{
+ if(ior < 1.0f) {
+ ior = 1.0f/ior;
+ }
+ a = saturate(a);
+ ior = clamp(ior, 1.0f, 3.0f);
+ float I_1 = 0.0476898f*expf(-0.978352f*(ior-0.65657f)*(ior-0.65657f)) - 0.033756f*ior + 0.993261f;
+ float R_1 = (((0.116991f*a - 0.270369f)*a + 0.0501366f)*a - 0.00411511f)*a + 1.00008f;
+ float I_2 = (((-2.08704f*ior + 26.3298f)*ior - 127.906f)*ior + 292.958f)*ior - 287.946f + 199.803f/(ior*ior) - 101.668f/(ior*ior*ior);
+ float R_2 = ((((5.3725f*a -24.9307f)*a + 22.7437f)*a - 3.40751f)*a + 0.0986325f)*a + 0.00493504f;
+
+ return saturate(1.0f + I_2*R_2*0.0019127f - (1.0f - I_1)*(1.0f - R_1)*9.3205f);
+}
+
ccl_device_forceinline float mf_ggx_pdf(const float3 wi, const float3 wo, const float alpha)
{
float D = D_ggx(normalize(wi+wo), alpha);
float lambda = mf_lambda(wi, make_float2(alpha, alpha));
+ float singlescatter = 0.25f * D / max((1.0f + lambda) * wi.z, 1e-7f);
+
+ float multiscatter = wo.z * M_1_PI_F;
+
float albedo = mf_ggx_albedo(alpha);
- return 0.25f * D / max((1.0f + lambda) * wi.z, 1e-7f) + (1.0f - albedo) * wo.z;
+ return albedo*singlescatter + (1.0f - albedo)*multiscatter;
}
ccl_device_forceinline float mf_ggx_aniso_pdf(const float3 wi, const float3 wo, const float2 alpha)
{
- return 0.25f * D_ggx_aniso(normalize(wi+wo), alpha) / ((1.0f + mf_lambda(wi, alpha)) * wi.z) + (1.0f - mf_ggx_albedo(sqrtf(alpha.x*alpha.y))) * wo.z;
-}
+ float D = D_ggx_aniso(normalize(wi+wo), alpha);
+ float lambda = mf_lambda(wi, alpha);
+ float singlescatter = 0.25f * D / max((1.0f + lambda) * wi.z, 1e-7f);
-ccl_device_forceinline float mf_diffuse_pdf(const float3 wo)
-{
- return M_1_PI_F * wo.z;
+ float multiscatter = wo.z * M_1_PI_F;
+
+ float albedo = mf_ggx_albedo(sqrtf(alpha.x*alpha.y));
+ return albedo*singlescatter + (1.0f - albedo)*multiscatter;
}
ccl_device_forceinline float mf_glass_pdf(const float3 wi, const float3 wo, const float alpha, const float eta)
{
- float3 wh;
- float fresnel;
- if(wi.z*wo.z > 0.0f) {
- wh = normalize(wi + wo);
- fresnel = fresnel_dielectric_cos(dot(wi, wh), eta);
- }
- else {
- wh = normalize(wi + wo*eta);
- fresnel = 1.0f - fresnel_dielectric_cos(dot(wi, wh), eta);
- }
+ bool reflective = (wi.z*wo.z > 0.0f);
+
+ float wh_len;
+ float3 wh = normalize_len(wi + (reflective? wo : (wo*eta)), &wh_len);
if(wh.z < 0.0f)
wh = -wh;
float3 r_wi = (wi.z < 0.0f)? -wi: wi;
- return fresnel * max(0.0f, dot(r_wi, wh)) * D_ggx(wh, alpha) / ((1.0f + mf_lambda(r_wi, make_float2(alpha, alpha))) * r_wi.z) + fabsf(wo.z);
+ float lambda = mf_lambda(r_wi, make_float2(alpha, alpha));
+ float D = D_ggx(wh, alpha);
+ float fresnel = fresnel_dielectric_cos(dot(r_wi, wh), eta);
+
+ float multiscatter = fabsf(wo.z * M_1_PI_F);
+ if(reflective) {
+ float singlescatter = 0.25f * D / max((1.0f + lambda) * r_wi.z, 1e-7f);
+ float albedo = mf_ggx_albedo(alpha);
+ return fresnel * (albedo*singlescatter + (1.0f - albedo)*multiscatter);
+ }
+ else {
+ float singlescatter = fabsf(dot(r_wi, wh)*dot(wo, wh) * D * eta*eta / max((1.0f + lambda) * r_wi.z * wh_len*wh_len, 1e-7f));
+ float albedo = mf_ggx_transmission_albedo(alpha, eta);
+ return (1.0f - fresnel) * (albedo*singlescatter + (1.0f - albedo)*multiscatter);
+ }
}
/* === Actual random walk implementations, one version of mf_eval and mf_sample per phase function. === */
@@ -315,13 +320,6 @@ ccl_device_forceinline float mf_glass_pdf(const float3 wi, const float3 wo, cons
#define MF_MULTI_GLASS
#include "kernel/closure/bsdf_microfacet_multi_impl.h"
-/* The diffuse phase function is not implemented as a node yet. */
-#if 0
-#define MF_PHASE_FUNCTION diffuse
-#define MF_MULTI_DIFFUSE
-#include "kernel/closure/bsdf_microfacet_multi_impl.h"
-#endif
-
#define MF_PHASE_FUNCTION glossy
#define MF_MULTI_GLOSSY
#include "kernel/closure/bsdf_microfacet_multi_impl.h"
@@ -345,8 +343,9 @@ ccl_device int bsdf_microfacet_multi_ggx_common_setup(MicrofacetBsdf *bsdf)
bsdf->extra->color.x = saturate(bsdf->extra->color.x);
bsdf->extra->color.y = saturate(bsdf->extra->color.y);
bsdf->extra->color.z = saturate(bsdf->extra->color.z);
-
- bsdf->type = CLOSURE_BSDF_MICROFACET_MULTI_GGX_ID;
+ bsdf->extra->cspec0.x = saturate(bsdf->extra->cspec0.x);
+ bsdf->extra->cspec0.y = saturate(bsdf->extra->cspec0.y);
+ bsdf->extra->cspec0.z = saturate(bsdf->extra->cspec0.z);
return SD_BSDF|SD_BSDF_HAS_EVAL|SD_BSDF_NEEDS_LCG;
}
@@ -356,6 +355,22 @@ ccl_device int bsdf_microfacet_multi_ggx_aniso_setup(MicrofacetBsdf *bsdf)
if(is_zero(bsdf->T))
bsdf->T = make_float3(1.0f, 0.0f, 0.0f);
+ bsdf->type = CLOSURE_BSDF_MICROFACET_MULTI_GGX_ID;
+
+ return bsdf_microfacet_multi_ggx_common_setup(bsdf);
+}
+
+ccl_device int bsdf_microfacet_multi_ggx_aniso_fresnel_setup(MicrofacetBsdf *bsdf, const ShaderData *sd)
+{
+ if(is_zero(bsdf->T))
+ bsdf->T = make_float3(1.0f, 0.0f, 0.0f);
+
+ bsdf->type = CLOSURE_BSDF_MICROFACET_MULTI_GGX_FRESNEL_ID;
+
+ float F0 = fresnel_dielectric_cos(1.0f, bsdf->ior);
+ float F = average(interpolate_fresnel_color(sd->I, bsdf->N, bsdf->ior, F0, bsdf->extra->cspec0));
+ bsdf->sample_weight *= F;
+
return bsdf_microfacet_multi_ggx_common_setup(bsdf);
}
@@ -363,6 +378,30 @@ ccl_device int bsdf_microfacet_multi_ggx_setup(MicrofacetBsdf *bsdf)
{
bsdf->alpha_y = bsdf->alpha_x;
+ bsdf->type = CLOSURE_BSDF_MICROFACET_MULTI_GGX_ID;
+
+ return bsdf_microfacet_multi_ggx_common_setup(bsdf);
+}
+
+ccl_device int bsdf_microfacet_multi_ggx_fresnel_setup(MicrofacetBsdf *bsdf, const ShaderData *sd)
+{
+ bsdf->alpha_y = bsdf->alpha_x;
+
+ bsdf->type = CLOSURE_BSDF_MICROFACET_MULTI_GGX_FRESNEL_ID;
+
+ float F0 = fresnel_dielectric_cos(1.0f, bsdf->ior);
+ float F = average(interpolate_fresnel_color(sd->I, bsdf->N, bsdf->ior, F0, bsdf->extra->cspec0));
+ bsdf->sample_weight *= F;
+
+ return bsdf_microfacet_multi_ggx_common_setup(bsdf);
+}
+
+ccl_device int bsdf_microfacet_multi_ggx_refraction_setup(MicrofacetBsdf *bsdf)
+{
+ bsdf->alpha_y = bsdf->alpha_x;
+
+ bsdf->type = CLOSURE_BSDF_MICROFACET_MULTI_GGX_ID;
+
return bsdf_microfacet_multi_ggx_common_setup(bsdf);
}
@@ -378,6 +417,8 @@ ccl_device float3 bsdf_microfacet_multi_ggx_eval_reflect(const ShaderClosure *sc
return make_float3(0.0f, 0.0f, 0.0f);
}
+ bool use_fresnel = (bsdf->type == CLOSURE_BSDF_MICROFACET_MULTI_GGX_FRESNEL_ID);
+
bool is_aniso = (bsdf->alpha_x != bsdf->alpha_y);
float3 X, Y, Z;
Z = bsdf->N;
@@ -393,7 +434,7 @@ ccl_device float3 bsdf_microfacet_multi_ggx_eval_reflect(const ShaderClosure *sc
*pdf = mf_ggx_aniso_pdf(localI, localO, make_float2(bsdf->alpha_x, bsdf->alpha_y));
else
*pdf = mf_ggx_pdf(localI, localO, bsdf->alpha_x);
- return mf_eval_glossy(localI, localO, true, bsdf->extra->color, bsdf->alpha_x, bsdf->alpha_y, lcg_state, NULL, NULL);
+ return mf_eval_glossy(localI, localO, true, bsdf->extra->color, bsdf->alpha_x, bsdf->alpha_y, lcg_state, bsdf->ior, use_fresnel, bsdf->extra->cspec0);
}
ccl_device int bsdf_microfacet_multi_ggx_sample(KernelGlobals *kg, const ShaderClosure *sc, float3 Ng, float3 I, float3 dIdx, float3 dIdy, float randu, float randv, float3 *eval, float3 *omega_in, float3 *domega_in_dx, float3 *domega_in_dy, float *pdf, ccl_addr_space uint *lcg_state)
@@ -407,9 +448,15 @@ ccl_device int bsdf_microfacet_multi_ggx_sample(KernelGlobals *kg, const ShaderC
*omega_in = 2*dot(Z, I)*Z - I;
*pdf = 1e6f;
*eval = make_float3(1e6f, 1e6f, 1e6f);
+#ifdef __RAY_DIFFERENTIALS__
+ *domega_in_dx = (2 * dot(Z, dIdx)) * Z - dIdx;
+ *domega_in_dy = (2 * dot(Z, dIdy)) * Z - dIdy;
+#endif
return LABEL_REFLECT|LABEL_SINGULAR;
}
+ bool use_fresnel = (bsdf->type == CLOSURE_BSDF_MICROFACET_MULTI_GGX_FRESNEL_ID);
+
bool is_aniso = (bsdf->alpha_x != bsdf->alpha_y);
if(is_aniso)
make_orthonormals_tangent(Z, bsdf->T, &X, &Y);
@@ -419,7 +466,7 @@ ccl_device int bsdf_microfacet_multi_ggx_sample(KernelGlobals *kg, const ShaderC
float3 localI = make_float3(dot(I, X), dot(I, Y), dot(I, Z));
float3 localO;
- *eval = mf_sample_glossy(localI, &localO, bsdf->extra->color, bsdf->alpha_x, bsdf->alpha_y, lcg_state, NULL, NULL);
+ *eval = mf_sample_glossy(localI, &localO, bsdf->extra->color, bsdf->alpha_x, bsdf->alpha_y, lcg_state, bsdf->ior, use_fresnel, bsdf->extra->cspec0);
if(is_aniso)
*pdf = mf_ggx_aniso_pdf(localI, localO, make_float2(bsdf->alpha_x, bsdf->alpha_y));
else
@@ -427,6 +474,7 @@ ccl_device int bsdf_microfacet_multi_ggx_sample(KernelGlobals *kg, const ShaderC
*eval *= *pdf;
*omega_in = X*localO.x + Y*localO.y + Z*localO.z;
+
#ifdef __RAY_DIFFERENTIALS__
*domega_in_dx = (2 * dot(Z, dIdx)) * Z - dIdx;
*domega_in_dy = (2 * dot(Z, dIdy)) * Z - dIdy;
@@ -450,6 +498,27 @@ ccl_device int bsdf_microfacet_multi_ggx_glass_setup(MicrofacetBsdf *bsdf)
return SD_BSDF|SD_BSDF_HAS_EVAL|SD_BSDF_NEEDS_LCG;
}
+ccl_device int bsdf_microfacet_multi_ggx_glass_fresnel_setup(MicrofacetBsdf *bsdf, const ShaderData *sd)
+{
+ bsdf->alpha_x = clamp(bsdf->alpha_x, 1e-4f, 1.0f);
+ bsdf->alpha_y = bsdf->alpha_x;
+ bsdf->ior = max(0.0f, bsdf->ior);
+ bsdf->extra->color.x = saturate(bsdf->extra->color.x);
+ bsdf->extra->color.y = saturate(bsdf->extra->color.y);
+ bsdf->extra->color.z = saturate(bsdf->extra->color.z);
+ bsdf->extra->cspec0.x = saturate(bsdf->extra->cspec0.x);
+ bsdf->extra->cspec0.y = saturate(bsdf->extra->cspec0.y);
+ bsdf->extra->cspec0.z = saturate(bsdf->extra->cspec0.z);
+
+ bsdf->type = CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_FRESNEL_ID;
+
+ float F0 = fresnel_dielectric_cos(1.0f, bsdf->ior);
+ float F = average(interpolate_fresnel_color(sd->I, bsdf->N, bsdf->ior, F0, bsdf->extra->cspec0));
+ bsdf->sample_weight *= F;
+
+ return SD_BSDF|SD_BSDF_HAS_EVAL|SD_BSDF_NEEDS_LCG;
+}
+
ccl_device float3 bsdf_microfacet_multi_ggx_glass_eval_transmit(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf, ccl_addr_space uint *lcg_state) {
const MicrofacetBsdf *bsdf = (const MicrofacetBsdf*)sc;
@@ -465,7 +534,7 @@ ccl_device float3 bsdf_microfacet_multi_ggx_glass_eval_transmit(const ShaderClos
float3 localO = make_float3(dot(omega_in, X), dot(omega_in, Y), dot(omega_in, Z));
*pdf = mf_glass_pdf(localI, localO, bsdf->alpha_x, bsdf->ior);
- return mf_eval_glass(localI, localO, false, bsdf->extra->color, bsdf->alpha_x, bsdf->alpha_y, lcg_state, bsdf->ior);
+ return mf_eval_glass(localI, localO, false, bsdf->extra->color, bsdf->alpha_x, bsdf->alpha_y, lcg_state, bsdf->ior, false, bsdf->extra->color);
}
ccl_device float3 bsdf_microfacet_multi_ggx_glass_eval_reflect(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf, ccl_addr_space uint *lcg_state) {
@@ -475,6 +544,8 @@ ccl_device float3 bsdf_microfacet_multi_ggx_glass_eval_reflect(const ShaderClosu
return make_float3(0.0f, 0.0f, 0.0f);
}
+ bool use_fresnel = (bsdf->type == CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_FRESNEL_ID);
+
float3 X, Y, Z;
Z = bsdf->N;
make_orthonormals(Z, &X, &Y);
@@ -483,7 +554,7 @@ ccl_device float3 bsdf_microfacet_multi_ggx_glass_eval_reflect(const ShaderClosu
float3 localO = make_float3(dot(omega_in, X), dot(omega_in, Y), dot(omega_in, Z));
*pdf = mf_glass_pdf(localI, localO, bsdf->alpha_x, bsdf->ior);
- return mf_eval_glass(localI, localO, true, bsdf->extra->color, bsdf->alpha_x, bsdf->alpha_y, lcg_state, bsdf->ior);
+ return mf_eval_glass(localI, localO, true, bsdf->extra->color, bsdf->alpha_x, bsdf->alpha_y, lcg_state, bsdf->ior, use_fresnel, bsdf->extra->cspec0);
}
ccl_device int bsdf_microfacet_multi_ggx_glass_sample(KernelGlobals *kg, const ShaderClosure *sc, float3 Ng, float3 I, float3 dIdx, float3 dIdy, float randu, float randv, float3 *eval, float3 *omega_in, float3 *domega_in_dx, float3 *domega_in_dy, float *pdf, ccl_addr_space uint *lcg_state)
@@ -525,12 +596,14 @@ ccl_device int bsdf_microfacet_multi_ggx_glass_sample(KernelGlobals *kg, const S
}
}
+ bool use_fresnel = (bsdf->type == CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_FRESNEL_ID);
+
make_orthonormals(Z, &X, &Y);
float3 localI = make_float3(dot(I, X), dot(I, Y), dot(I, Z));
float3 localO;
- *eval = mf_sample_glass(localI, &localO, bsdf->extra->color, bsdf->alpha_x, bsdf->alpha_y, lcg_state, bsdf->ior);
+ *eval = mf_sample_glass(localI, &localO, bsdf->extra->color, bsdf->alpha_x, bsdf->alpha_y, lcg_state, bsdf->ior, use_fresnel, bsdf->extra->cspec0);
*pdf = mf_glass_pdf(localI, localO, bsdf->alpha_x, bsdf->ior);
*eval *= *pdf;
diff --git a/intern/cycles/kernel/closure/bsdf_microfacet_multi_impl.h b/intern/cycles/kernel/closure/bsdf_microfacet_multi_impl.h
index 8054fa8e849..e73915dbda7 100644
--- a/intern/cycles/kernel/closure/bsdf_microfacet_multi_impl.h
+++ b/intern/cycles/kernel/closure/bsdf_microfacet_multi_impl.h
@@ -26,19 +26,16 @@
* the balance heuristic isn't necessarily optimal anymore.
*/
ccl_device_forceinline float3 MF_FUNCTION_FULL_NAME(mf_eval)(
- float3 wi,
- float3 wo,
- const bool wo_outside,
- const float3 color,
- const float alpha_x,
- const float alpha_y,
- ccl_addr_space uint *lcg_state
-#ifdef MF_MULTI_GLASS
- , const float eta
-#elif defined(MF_MULTI_GLOSSY)
- , float3 *n, float3 *k
-#endif
-)
+ float3 wi,
+ float3 wo,
+ const bool wo_outside,
+ const float3 color,
+ const float alpha_x,
+ const float alpha_y,
+ ccl_addr_space uint *lcg_state,
+ const float eta,
+ bool use_fresnel,
+ const float3 cspec0)
{
/* Evaluating for a shallower incoming direction produces less noise, and the properties of the BSDF guarantee reciprocity. */
bool swapped = false;
@@ -71,50 +68,57 @@ ccl_device_forceinline float3 MF_FUNCTION_FULL_NAME(mf_eval)(
/* Analytically compute single scattering for lower noise. */
float3 eval;
+ float3 throughput = make_float3(1.0f, 1.0f, 1.0f);
+ const float3 wh = normalize(wi+wo);
#ifdef MF_MULTI_GLASS
eval = mf_eval_phase_glass(-wi, lambda_r, wo, wo_outside, alpha, eta);
if(wo_outside)
eval *= -lambda_r / (shadowing_lambda - lambda_r);
else
eval *= -lambda_r * beta(-lambda_r, shadowing_lambda+1.0f);
-#elif defined(MF_MULTI_DIFFUSE)
- /* Diffuse has no special closed form for the single scattering bounce */
- eval = make_float3(0.0f, 0.0f, 0.0f);
#else /* MF_MULTI_GLOSSY */
- const float3 wh = normalize(wi+wo);
const float G2 = 1.0f / (1.0f - (lambda_r + 1.0f) + shadowing_lambda);
float val = G2 * 0.25f / wi.z;
if(alpha.x == alpha.y)
val *= D_ggx(wh, alpha.x);
else
val *= D_ggx_aniso(wh, alpha);
- if(n && k) {
- eval = fresnel_conductor(dot(wh, wi), *n, *k) * val;
- }
- else {
- eval = make_float3(val, val, val);
- }
+ eval = make_float3(val, val, val);
#endif
+ float F0 = fresnel_dielectric_cos(1.0f, eta);
+ if(use_fresnel) {
+ throughput = interpolate_fresnel_color(wi, wh, eta, F0, cspec0);
+
+ eval *= throughput;
+ }
+
float3 wr = -wi;
float hr = 1.0f;
float C1_r = 1.0f;
float G1_r = 0.0f;
bool outside = true;
- float3 throughput = make_float3(1.0f, 1.0f, 1.0f);
for(int order = 0; order < 10; order++) {
- /* Sample microfacet height and normal */
- if(!mf_sample_height(wr, &hr, &C1_r, &G1_r, &lambda_r, lcg_step_float_addrspace(lcg_state)))
+ /* Sample microfacet height. */
+ float height_rand = lcg_step_float_addrspace(lcg_state);
+ if(!mf_sample_height(wr, &hr, &C1_r, &G1_r, &lambda_r, height_rand))
break;
- float3 wm = mf_sample_vndf(-wr, alpha, make_float2(lcg_step_float_addrspace(lcg_state),
- lcg_step_float_addrspace(lcg_state)));
-
-#ifdef MF_MULTI_DIFFUSE
- if(order == 0) {
- /* Compute single-scattering for diffuse. */
- const float G2_G1 = -lambda_r / (shadowing_lambda - lambda_r);
- eval += throughput * G2_G1 * mf_eval_phase_diffuse(wo, wm);
+ /* Sample microfacet normal. */
+ float vndf_rand_y = lcg_step_float_addrspace(lcg_state);
+ float vndf_rand_x = lcg_step_float_addrspace(lcg_state);
+ float3 wm = mf_sample_vndf(-wr, alpha, vndf_rand_x, vndf_rand_y);
+
+#ifdef MF_MULTI_GLASS
+ if(order == 0 && use_fresnel) {
+ /* Evaluate amount of scattering towards wo on this microfacet. */
+ float3 phase;
+ if(outside)
+ phase = mf_eval_phase_glass(wr, lambda_r, wo, wo_outside, alpha, eta);
+ else
+ phase = mf_eval_phase_glass(wr, lambda_r, -wo, !wo_outside, alpha, 1.0f / eta);
+
+ eval = throughput * phase * mf_G1(wo_outside ? wo : -wo, mf_C1((outside == wo_outside) ? hr : -hr), shadowing_lambda);
}
#endif
if(order > 0) {
@@ -125,10 +129,8 @@ ccl_device_forceinline float3 MF_FUNCTION_FULL_NAME(mf_eval)(
phase = mf_eval_phase_glass(wr, lambda_r, wo, wo_outside, alpha, eta);
else
phase = mf_eval_phase_glass(wr, lambda_r, -wo, !wo_outside, alpha, 1.0f/eta);
-#elif defined(MF_MULTI_DIFFUSE)
- phase = mf_eval_phase_diffuse(wo, wm);
#else /* MF_MULTI_GLOSSY */
- phase = mf_eval_phase_glossy(wr, lambda_r, wo, alpha, n, k) * throughput;
+ phase = mf_eval_phase_glossy(wr, lambda_r, wo, alpha) * throughput;
#endif
eval += throughput * phase * mf_G1(wo_outside? wo: -wo, mf_C1((outside == wo_outside)? hr: -hr), shadowing_lambda);
}
@@ -136,23 +138,32 @@ ccl_device_forceinline float3 MF_FUNCTION_FULL_NAME(mf_eval)(
/* Bounce from the microfacet. */
#ifdef MF_MULTI_GLASS
bool next_outside;
- wr = mf_sample_phase_glass(-wr, outside? eta: 1.0f/eta, wm, lcg_step_float_addrspace(lcg_state), &next_outside);
+ float3 wi_prev = -wr;
+ float phase_rand = lcg_step_float_addrspace(lcg_state);
+ wr = mf_sample_phase_glass(-wr, outside? eta: 1.0f/eta, wm, phase_rand, &next_outside);
if(!next_outside) {
outside = !outside;
wr = -wr;
hr = -hr;
}
-#elif defined(MF_MULTI_DIFFUSE)
- wr = mf_sample_phase_diffuse(wm,
- lcg_step_float_addrspace(lcg_state),
- lcg_step_float_addrspace(lcg_state));
+
+ if(use_fresnel && !next_outside) {
+ throughput *= color;
+ }
+ else if(use_fresnel && order > 0) {
+ throughput *= interpolate_fresnel_color(wi_prev, wm, eta, F0, cspec0);
+ }
#else /* MF_MULTI_GLOSSY */
- wr = mf_sample_phase_glossy(-wr, n, k, &throughput, wm);
+ if(use_fresnel && order > 0) {
+ throughput *= interpolate_fresnel_color(-wr, wm, eta, F0, cspec0);
+ }
+ wr = mf_sample_phase_glossy(-wr, &throughput, wm);
#endif
lambda_r = mf_lambda(wr, alpha);
- throughput *= color;
+ if(!use_fresnel)
+ throughput *= color;
C1_r = mf_C1(hr);
G1_r = mf_G1(wr, C1_r, lambda_r);
@@ -168,13 +179,16 @@ ccl_device_forceinline float3 MF_FUNCTION_FULL_NAME(mf_eval)(
* escaped the surface in wo. The function returns the throughput between wi and wo.
* Without reflection losses due to coloring or fresnel absorption in conductors, the sampling is optimal.
*/
-ccl_device_forceinline float3 MF_FUNCTION_FULL_NAME(mf_sample)(float3 wi, float3 *wo, const float3 color, const float alpha_x, const float alpha_y, ccl_addr_space uint *lcg_state
-#ifdef MF_MULTI_GLASS
- , const float eta
-#elif defined(MF_MULTI_GLOSSY)
- , float3 *n, float3 *k
-#endif
-)
+ccl_device_forceinline float3 MF_FUNCTION_FULL_NAME(mf_sample)(
+ float3 wi,
+ float3 *wo,
+ const float3 color,
+ const float alpha_x,
+ const float alpha_y,
+ ccl_addr_space uint *lcg_state,
+ const float eta,
+ bool use_fresnel,
+ const float3 cspec0)
{
const float2 alpha = make_float2(alpha_x, alpha_y);
@@ -186,37 +200,64 @@ ccl_device_forceinline float3 MF_FUNCTION_FULL_NAME(mf_sample)(float3 wi, float3
float G1_r = 0.0f;
bool outside = true;
+ float F0 = fresnel_dielectric_cos(1.0f, eta);
+ if(use_fresnel) {
+ throughput = interpolate_fresnel_color(wi, normalize(wi + wr), eta, F0, cspec0);
+ }
+
int order;
for(order = 0; order < 10; order++) {
/* Sample microfacet height. */
- if(!mf_sample_height(wr, &hr, &C1_r, &G1_r, &lambda_r, lcg_step_float_addrspace(lcg_state))) {
+ float height_rand = lcg_step_float_addrspace(lcg_state);
+ if(!mf_sample_height(wr, &hr, &C1_r, &G1_r, &lambda_r, height_rand)) {
/* The random walk has left the surface. */
*wo = outside? wr: -wr;
return throughput;
}
/* Sample microfacet normal. */
- float3 wm = mf_sample_vndf(-wr, alpha, make_float2(lcg_step_float_addrspace(lcg_state),
- lcg_step_float_addrspace(lcg_state)));
+ float vndf_rand_y = lcg_step_float_addrspace(lcg_state);
+ float vndf_rand_x = lcg_step_float_addrspace(lcg_state);
+ float3 wm = mf_sample_vndf(-wr, alpha, vndf_rand_x, vndf_rand_y);
/* First-bounce color is already accounted for in mix weight. */
- if(order > 0)
+ if(!use_fresnel && order > 0)
throughput *= color;
/* Bounce from the microfacet. */
#ifdef MF_MULTI_GLASS
bool next_outside;
- wr = mf_sample_phase_glass(-wr, outside? eta: 1.0f/eta, wm, lcg_step_float_addrspace(lcg_state), &next_outside);
+ float3 wi_prev = -wr;
+ float phase_rand = lcg_step_float_addrspace(lcg_state);
+ wr = mf_sample_phase_glass(-wr, outside? eta: 1.0f/eta, wm, phase_rand, &next_outside);
if(!next_outside) {
hr = -hr;
wr = -wr;
outside = !outside;
}
-#elif defined(MF_MULTI_DIFFUSE)
- wr = mf_sample_phase_diffuse(wm,
- lcg_step_float_addrspace(lcg_state),
- lcg_step_float_addrspace(lcg_state));
+
+ if(use_fresnel) {
+ if(!next_outside) {
+ throughput *= color;
+ }
+ else {
+ float3 t_color = interpolate_fresnel_color(wi_prev, wm, eta, F0, cspec0);
+
+ if(order == 0)
+ throughput = t_color;
+ else
+ throughput *= t_color;
+ }
+ }
#else /* MF_MULTI_GLOSSY */
- wr = mf_sample_phase_glossy(-wr, n, k, &throughput, wm);
+ if(use_fresnel) {
+ float3 t_color = interpolate_fresnel_color(-wr, wm, eta, F0, cspec0);
+
+ if(order == 0)
+ throughput = t_color;
+ else
+ throughput *= t_color;
+ }
+ wr = mf_sample_phase_glossy(-wr, &throughput, wm);
#endif
/* Update random walk parameters. */
@@ -228,6 +269,5 @@ ccl_device_forceinline float3 MF_FUNCTION_FULL_NAME(mf_sample)(float3 wi, float3
}
#undef MF_MULTI_GLASS
-#undef MF_MULTI_DIFFUSE
#undef MF_MULTI_GLOSSY
#undef MF_PHASE_FUNCTION
diff --git a/intern/cycles/kernel/closure/bsdf_oren_nayar.h b/intern/cycles/kernel/closure/bsdf_oren_nayar.h
index cb342a026ef..6b770fc0c16 100644
--- a/intern/cycles/kernel/closure/bsdf_oren_nayar.h
+++ b/intern/cycles/kernel/closure/bsdf_oren_nayar.h
@@ -22,7 +22,6 @@ CCL_NAMESPACE_BEGIN
typedef ccl_addr_space struct OrenNayarBsdf {
SHADER_CLOSURE_BASE;
- float3 N;
float roughness;
float a;
float b;
diff --git a/intern/cycles/kernel/closure/bsdf_phong_ramp.h b/intern/cycles/kernel/closure/bsdf_phong_ramp.h
index e152a8780db..420f94755ee 100644
--- a/intern/cycles/kernel/closure/bsdf_phong_ramp.h
+++ b/intern/cycles/kernel/closure/bsdf_phong_ramp.h
@@ -40,7 +40,6 @@ CCL_NAMESPACE_BEGIN
typedef ccl_addr_space struct PhongRampBsdf {
SHADER_CLOSURE_BASE;
- float3 N;
float exponent;
float3 *colors;
} PhongRampBsdf;
diff --git a/intern/cycles/kernel/closure/bsdf_principled_diffuse.h b/intern/cycles/kernel/closure/bsdf_principled_diffuse.h
new file mode 100644
index 00000000000..f8ca64293b0
--- /dev/null
+++ b/intern/cycles/kernel/closure/bsdf_principled_diffuse.h
@@ -0,0 +1,127 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __BSDF_PRINCIPLED_DIFFUSE_H__
+#define __BSDF_PRINCIPLED_DIFFUSE_H__
+
+/* DISNEY PRINCIPLED DIFFUSE BRDF
+ *
+ * Shading model by Brent Burley (Disney): "Physically Based Shading at Disney" (2012)
+ */
+
+CCL_NAMESPACE_BEGIN
+
+typedef ccl_addr_space struct PrincipledDiffuseBsdf {
+ SHADER_CLOSURE_BASE;
+
+ float roughness;
+} PrincipledDiffuseBsdf;
+
+ccl_device float3 calculate_principled_diffuse_brdf(const PrincipledDiffuseBsdf *bsdf,
+ float3 N, float3 V, float3 L, float3 H, float *pdf)
+{
+ float NdotL = max(dot(N, L), 0.0f);
+ float NdotV = max(dot(N, V), 0.0f);
+
+ if(NdotL < 0 || NdotV < 0) {
+ *pdf = 0.0f;
+ return make_float3(0.0f, 0.0f, 0.0f);
+ }
+
+ float LdotH = dot(L, H);
+
+ float FL = schlick_fresnel(NdotL), FV = schlick_fresnel(NdotV);
+ const float Fd90 = 0.5f + 2.0f * LdotH*LdotH * bsdf->roughness;
+ float Fd = (1.0f * (1.0f - FL) + Fd90 * FL) * (1.0f * (1.0f - FV) + Fd90 * FV);
+
+ float value = M_1_PI_F * NdotL * Fd;
+
+ return make_float3(value, value, value);
+}
+
+ccl_device int bsdf_principled_diffuse_setup(PrincipledDiffuseBsdf *bsdf)
+{
+ bsdf->type = CLOSURE_BSDF_PRINCIPLED_DIFFUSE_ID;
+ return SD_BSDF|SD_BSDF_HAS_EVAL;
+}
+
+ccl_device bool bsdf_principled_diffuse_merge(const ShaderClosure *a, const ShaderClosure *b)
+{
+ const PrincipledDiffuseBsdf *bsdf_a = (const PrincipledDiffuseBsdf*)a;
+ const PrincipledDiffuseBsdf *bsdf_b = (const PrincipledDiffuseBsdf*)b;
+
+ return (isequal_float3(bsdf_a->N, bsdf_b->N) && bsdf_a->roughness == bsdf_b->roughness);
+}
+
+ccl_device float3 bsdf_principled_diffuse_eval_reflect(const ShaderClosure *sc, const float3 I,
+ const float3 omega_in, float *pdf)
+{
+ const PrincipledDiffuseBsdf *bsdf = (const PrincipledDiffuseBsdf *)sc;
+
+ float3 N = bsdf->N;
+ float3 V = I; // outgoing
+ float3 L = omega_in; // incoming
+ float3 H = normalize(L + V);
+
+ if(dot(N, omega_in) > 0.0f) {
+ *pdf = fmaxf(dot(N, omega_in), 0.0f) * M_1_PI_F;
+ return calculate_principled_diffuse_brdf(bsdf, N, V, L, H, pdf);
+ }
+ else {
+ *pdf = 0.0f;
+ return make_float3(0.0f, 0.0f, 0.0f);
+ }
+}
+
+ccl_device float3 bsdf_principled_diffuse_eval_transmit(const ShaderClosure *sc, const float3 I,
+ const float3 omega_in, float *pdf)
+{
+ return make_float3(0.0f, 0.0f, 0.0f);
+}
+
+ccl_device int bsdf_principled_diffuse_sample(const ShaderClosure *sc,
+ float3 Ng, float3 I, float3 dIdx, float3 dIdy, float randu, float randv,
+ float3 *eval, float3 *omega_in, float3 *domega_in_dx,
+ float3 *domega_in_dy, float *pdf)
+{
+ const PrincipledDiffuseBsdf *bsdf = (const PrincipledDiffuseBsdf *)sc;
+
+ float3 N = bsdf->N;
+
+ sample_cos_hemisphere(N, randu, randv, omega_in, pdf);
+
+ if(dot(Ng, *omega_in) > 0) {
+ float3 H = normalize(I + *omega_in);
+
+ *eval = calculate_principled_diffuse_brdf(bsdf, N, I, *omega_in, H, pdf);
+
+#ifdef __RAY_DIFFERENTIALS__
+ // TODO: find a better approximation for the diffuse bounce
+ *domega_in_dx = -((2 * dot(N, dIdx)) * N - dIdx);
+ *domega_in_dy = -((2 * dot(N, dIdy)) * N - dIdy);
+#endif
+ }
+ else {
+ *pdf = 0.0f;
+ }
+ return LABEL_REFLECT|LABEL_DIFFUSE;
+}
+
+CCL_NAMESPACE_END
+
+#endif /* __BSDF_PRINCIPLED_DIFFUSE_H__ */
+
+
diff --git a/intern/cycles/kernel/closure/bsdf_principled_sheen.h b/intern/cycles/kernel/closure/bsdf_principled_sheen.h
new file mode 100644
index 00000000000..f4476bfecd0
--- /dev/null
+++ b/intern/cycles/kernel/closure/bsdf_principled_sheen.h
@@ -0,0 +1,113 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __BSDF_PRINCIPLED_SHEEN_H__
+#define __BSDF_PRINCIPLED_SHEEN_H__
+
+/* DISNEY PRINCIPLED SHEEN BRDF
+ *
+ * Shading model by Brent Burley (Disney): "Physically Based Shading at Disney" (2012)
+ */
+
+CCL_NAMESPACE_BEGIN
+
+typedef ccl_addr_space struct PrincipledSheenBsdf {
+ SHADER_CLOSURE_BASE;
+} PrincipledSheenBsdf;
+
+ccl_device float3 calculate_principled_sheen_brdf(const PrincipledSheenBsdf *bsdf,
+ float3 N, float3 V, float3 L, float3 H, float *pdf)
+{
+ float NdotL = dot(N, L);
+ float NdotV = dot(N, V);
+
+ if(NdotL < 0 || NdotV < 0) {
+ *pdf = 0.0f;
+ return make_float3(0.0f, 0.0f, 0.0f);
+ }
+
+ float LdotH = dot(L, H);
+
+ float value = schlick_fresnel(LdotH) * NdotL;
+
+ return make_float3(value, value, value);
+}
+
+ccl_device int bsdf_principled_sheen_setup(PrincipledSheenBsdf *bsdf)
+{
+ bsdf->type = CLOSURE_BSDF_PRINCIPLED_SHEEN_ID;
+ return SD_BSDF|SD_BSDF_HAS_EVAL;
+}
+
+ccl_device float3 bsdf_principled_sheen_eval_reflect(const ShaderClosure *sc, const float3 I,
+ const float3 omega_in, float *pdf)
+{
+ const PrincipledSheenBsdf *bsdf = (const PrincipledSheenBsdf *)sc;
+
+ float3 N = bsdf->N;
+ float3 V = I; // outgoing
+ float3 L = omega_in; // incoming
+ float3 H = normalize(L + V);
+
+ if(dot(N, omega_in) > 0.0f) {
+ *pdf = fmaxf(dot(N, omega_in), 0.0f) * M_1_PI_F;
+ return calculate_principled_sheen_brdf(bsdf, N, V, L, H, pdf);
+ }
+ else {
+ *pdf = 0.0f;
+ return make_float3(0.0f, 0.0f, 0.0f);
+ }
+}
+
+ccl_device float3 bsdf_principled_sheen_eval_transmit(const ShaderClosure *sc, const float3 I,
+ const float3 omega_in, float *pdf)
+{
+ return make_float3(0.0f, 0.0f, 0.0f);
+}
+
+ccl_device int bsdf_principled_sheen_sample(const ShaderClosure *sc,
+ float3 Ng, float3 I, float3 dIdx, float3 dIdy, float randu, float randv,
+ float3 *eval, float3 *omega_in, float3 *domega_in_dx,
+ float3 *domega_in_dy, float *pdf)
+{
+ const PrincipledSheenBsdf *bsdf = (const PrincipledSheenBsdf *)sc;
+
+ float3 N = bsdf->N;
+
+ sample_cos_hemisphere(N, randu, randv, omega_in, pdf);
+
+ if(dot(Ng, *omega_in) > 0) {
+ float3 H = normalize(I + *omega_in);
+
+ *eval = calculate_principled_sheen_brdf(bsdf, N, I, *omega_in, H, pdf);
+
+#ifdef __RAY_DIFFERENTIALS__
+ // TODO: find a better approximation for the diffuse bounce
+ *domega_in_dx = -((2 * dot(N, dIdx)) * N - dIdx);
+ *domega_in_dy = -((2 * dot(N, dIdy)) * N - dIdy);
+#endif
+ }
+ else {
+ *pdf = 0.0f;
+ }
+ return LABEL_REFLECT|LABEL_DIFFUSE;
+}
+
+CCL_NAMESPACE_END
+
+#endif /* __BSDF_PRINCIPLED_SHEEN_H__ */
+
+
diff --git a/intern/cycles/kernel/closure/bsdf_toon.h b/intern/cycles/kernel/closure/bsdf_toon.h
index 28e775bcbc8..d8b6d8ddead 100644
--- a/intern/cycles/kernel/closure/bsdf_toon.h
+++ b/intern/cycles/kernel/closure/bsdf_toon.h
@@ -38,7 +38,6 @@ CCL_NAMESPACE_BEGIN
typedef ccl_addr_space struct ToonBsdf {
SHADER_CLOSURE_BASE;
- float3 N;
float size;
float smooth;
} ToonBsdf;
diff --git a/intern/cycles/kernel/closure/bsdf_util.h b/intern/cycles/kernel/closure/bsdf_util.h
index b0c5280b6cb..3dc15d5791c 100644
--- a/intern/cycles/kernel/closure/bsdf_util.h
+++ b/intern/cycles/kernel/closure/bsdf_util.h
@@ -124,6 +124,13 @@ ccl_device float3 fresnel_conductor(float cosi, const float3 eta, const float3 k
return(Rparl2 + Rperp2) * 0.5f;
}
+ccl_device float schlick_fresnel(float u)
+{
+ float m = clamp(1.0f - u, 0.0f, 1.0f);
+ float m2 = m * m;
+ return m2 * m2 * m; // pow(m, 5)
+}
+
ccl_device float smooth_step(float edge0, float edge1, float x)
{
float result;
@@ -136,6 +143,19 @@ ccl_device float smooth_step(float edge0, float edge1, float x)
return result;
}
+/* Calculate the fresnel color which is a blend between white and the F0 color (cspec0) */
+ccl_device_forceinline float3 interpolate_fresnel_color(float3 L, float3 H, float ior, float F0, float3 cspec0) {
+ /* Calculate the fresnel interpolation factor
+ * The value from fresnel_dielectric_cos(...) has to be normalized because
+ * the cspec0 keeps the F0 color
+ */
+ float F0_norm = 1.0f / (1.0f - F0);
+ float FH = (fresnel_dielectric_cos(dot(L, H), ior) - F0) * F0_norm;
+
+ /* Blend between white and a specular color with respect to the fresnel */
+ return cspec0 * (1.0f - FH) + make_float3(1.0f, 1.0f, 1.0f) * FH;
+}
+
CCL_NAMESPACE_END
#endif /* __BSDF_UTIL_H__ */
diff --git a/intern/cycles/kernel/closure/bssrdf.h b/intern/cycles/kernel/closure/bssrdf.h
index af0bbd861a9..f733ea4c517 100644
--- a/intern/cycles/kernel/closure/bssrdf.h
+++ b/intern/cycles/kernel/closure/bssrdf.h
@@ -27,7 +27,7 @@ typedef ccl_addr_space struct Bssrdf {
float d;
float texture_blur;
float albedo;
- float3 N;
+ float roughness;
} Bssrdf;
/* Planar Truncated Gaussian
@@ -360,10 +360,32 @@ ccl_device int bssrdf_setup(Bssrdf *bssrdf, ClosureType type)
{
if(bssrdf->radius < BSSRDF_MIN_RADIUS) {
/* revert to diffuse BSDF if radius too small */
- DiffuseBsdf *bsdf = (DiffuseBsdf*)bssrdf;
- bsdf->N = bssrdf->N;
- int flag = bsdf_diffuse_setup(bsdf);
- bsdf->type = CLOSURE_BSDF_BSSRDF_ID;
+ int flag;
+#ifdef __PRINCIPLED__
+ if(type == CLOSURE_BSSRDF_PRINCIPLED_ID) {
+ float roughness = bssrdf->roughness;
+ float3 N = bssrdf->N;
+ float3 weight = bssrdf->weight;
+ float sample_weight = bssrdf->sample_weight;
+
+ PrincipledDiffuseBsdf *bsdf = (PrincipledDiffuseBsdf*)bssrdf;
+
+ bsdf->N = N;
+ bsdf->roughness = roughness;
+ bsdf->weight = weight;
+ bsdf->sample_weight = sample_weight;
+ flag = bsdf_principled_diffuse_setup(bsdf);
+ bsdf->type = CLOSURE_BSDF_BSSRDF_PRINCIPLED_ID;
+ }
+ else
+#endif /* __PRINCIPLED__ */
+ {
+ DiffuseBsdf *bsdf = (DiffuseBsdf*)bssrdf;
+ bsdf->N = bssrdf->N;
+ flag = bsdf_diffuse_setup(bsdf);
+ bsdf->type = CLOSURE_BSDF_BSSRDF_ID;
+ }
+
return flag;
}
else {
@@ -371,7 +393,9 @@ ccl_device int bssrdf_setup(Bssrdf *bssrdf, ClosureType type)
bssrdf->sharpness = saturate(bssrdf->sharpness);
bssrdf->type = type;
- if(type == CLOSURE_BSSRDF_BURLEY_ID) {
+ if(type == CLOSURE_BSSRDF_BURLEY_ID ||
+ type == CLOSURE_BSSRDF_PRINCIPLED_ID)
+ {
bssrdf_burley_setup(bssrdf);
}
@@ -385,7 +409,7 @@ ccl_device void bssrdf_sample(const ShaderClosure *sc, float xi, float *r, float
bssrdf_cubic_sample(sc, xi, r, h);
else if(sc->type == CLOSURE_BSSRDF_GAUSSIAN_ID)
bssrdf_gaussian_sample(sc, xi, r, h);
- else /*if(sc->type == CLOSURE_BSSRDF_BURLEY_ID)*/
+ else /*if(sc->type == CLOSURE_BSSRDF_BURLEY_ID || sc->type == CLOSURE_BSSRDF_PRINCIPLED_ID)*/
bssrdf_burley_sample(sc, xi, r, h);
}
@@ -395,7 +419,7 @@ ccl_device_forceinline float bssrdf_pdf(const ShaderClosure *sc, float r)
return bssrdf_cubic_pdf(sc, r);
else if(sc->type == CLOSURE_BSSRDF_GAUSSIAN_ID)
return bssrdf_gaussian_pdf(sc, r);
- else /*if(sc->type == CLOSURE_BSSRDF_BURLEY_ID)*/
+ else /*if(sc->type == CLOSURE_BSSRDF_BURLEY_ID || sc->type == CLOSURE_BSSRDF_PRINCIPLED_ID)*/
return bssrdf_burley_pdf(sc, r);
}
diff --git a/intern/cycles/kernel/filter/filter.h b/intern/cycles/kernel/filter/filter.h
new file mode 100644
index 00000000000..f6e474d6702
--- /dev/null
+++ b/intern/cycles/kernel/filter/filter.h
@@ -0,0 +1,52 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __FILTER_H__
+#define __FILTER_H__
+
+/* CPU Filter Kernel Interface */
+
+#include "util/util_types.h"
+
+#include "kernel/filter/filter_defines.h"
+
+CCL_NAMESPACE_BEGIN
+
+#define KERNEL_NAME_JOIN(x, y, z) x ## _ ## y ## _ ## z
+#define KERNEL_NAME_EVAL(arch, name) KERNEL_NAME_JOIN(kernel, arch, name)
+#define KERNEL_FUNCTION_FULL_NAME(name) KERNEL_NAME_EVAL(KERNEL_ARCH, name)
+
+#define KERNEL_ARCH cpu
+#include "kernel/kernels/cpu/filter_cpu.h"
+
+#define KERNEL_ARCH cpu_sse2
+#include "kernel/kernels/cpu/filter_cpu.h"
+
+#define KERNEL_ARCH cpu_sse3
+#include "kernel/kernels/cpu/filter_cpu.h"
+
+#define KERNEL_ARCH cpu_sse41
+#include "kernel/kernels/cpu/filter_cpu.h"
+
+#define KERNEL_ARCH cpu_avx
+#include "kernel/kernels/cpu/filter_cpu.h"
+
+#define KERNEL_ARCH cpu_avx2
+#include "kernel/kernels/cpu/filter_cpu.h"
+
+CCL_NAMESPACE_END
+
+#endif /* __FILTER_H__ */
diff --git a/intern/cycles/kernel/filter/filter_defines.h b/intern/cycles/kernel/filter/filter_defines.h
new file mode 100644
index 00000000000..ce96f733aff
--- /dev/null
+++ b/intern/cycles/kernel/filter/filter_defines.h
@@ -0,0 +1,38 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __FILTER_DEFINES_H__
+#define __FILTER_DEFINES_H__
+
+#define DENOISE_FEATURES 10
+#define TRANSFORM_SIZE (DENOISE_FEATURES*DENOISE_FEATURES)
+#define XTWX_SIZE (((DENOISE_FEATURES+1)*(DENOISE_FEATURES+2))/2)
+#define XTWY_SIZE (DENOISE_FEATURES+1)
+
+typedef struct TilesInfo {
+ int offsets[9];
+ int strides[9];
+ int x[4];
+ int y[4];
+ /* TODO(lukas): CUDA doesn't have uint64_t... */
+#ifdef __KERNEL_OPENCL__
+ ccl_global float *buffers[9];
+#else
+ long long int buffers[9];
+#endif
+} TilesInfo;
+
+#endif /* __FILTER_DEFINES_H__*/
diff --git a/intern/cycles/kernel/filter/filter_features.h b/intern/cycles/kernel/filter/filter_features.h
new file mode 100644
index 00000000000..6226ed2c2ef
--- /dev/null
+++ b/intern/cycles/kernel/filter/filter_features.h
@@ -0,0 +1,124 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ CCL_NAMESPACE_BEGIN
+
+#define ccl_get_feature(buffer, pass) (buffer)[(pass)*pass_stride]
+
+/* Loop over the pixels in the range [low.x, high.x) x [low.y, high.y).
+ * pixel_buffer always points to the current pixel in the first pass. */
+#define FOR_PIXEL_WINDOW pixel_buffer = buffer + (low.y - rect.y)*buffer_w + (low.x - rect.x); \
+ for(pixel.y = low.y; pixel.y < high.y; pixel.y++) { \
+ for(pixel.x = low.x; pixel.x < high.x; pixel.x++, pixel_buffer++) {
+
+#define END_FOR_PIXEL_WINDOW } \
+ pixel_buffer += buffer_w - (high.x - low.x); \
+ }
+
+ccl_device_inline void filter_get_features(int2 pixel,
+ const ccl_global float *ccl_restrict buffer,
+ float *features,
+ const float *ccl_restrict mean,
+ int pass_stride)
+{
+ features[0] = pixel.x;
+ features[1] = pixel.y;
+ features[2] = fabsf(ccl_get_feature(buffer, 0));
+ features[3] = ccl_get_feature(buffer, 1);
+ features[4] = ccl_get_feature(buffer, 2);
+ features[5] = ccl_get_feature(buffer, 3);
+ features[6] = ccl_get_feature(buffer, 4);
+ features[7] = ccl_get_feature(buffer, 5);
+ features[8] = ccl_get_feature(buffer, 6);
+ features[9] = ccl_get_feature(buffer, 7);
+ if(mean) {
+ for(int i = 0; i < DENOISE_FEATURES; i++)
+ features[i] -= mean[i];
+ }
+}
+
+ccl_device_inline void filter_get_feature_scales(int2 pixel,
+ const ccl_global float *ccl_restrict buffer,
+ float *scales,
+ const float *ccl_restrict mean,
+ int pass_stride)
+{
+ scales[0] = fabsf(pixel.x - mean[0]);
+ scales[1] = fabsf(pixel.y - mean[1]);
+ scales[2] = fabsf(fabsf(ccl_get_feature(buffer, 0)) - mean[2]);
+ scales[3] = len_squared(make_float3(ccl_get_feature(buffer, 1) - mean[3],
+ ccl_get_feature(buffer, 2) - mean[4],
+ ccl_get_feature(buffer, 3) - mean[5]));
+ scales[4] = fabsf(ccl_get_feature(buffer, 4) - mean[6]);
+ scales[5] = len_squared(make_float3(ccl_get_feature(buffer, 5) - mean[7],
+ ccl_get_feature(buffer, 6) - mean[8],
+ ccl_get_feature(buffer, 7) - mean[9]));
+}
+
+ccl_device_inline void filter_calculate_scale(float *scale)
+{
+ scale[0] = 1.0f/max(scale[0], 0.01f);
+ scale[1] = 1.0f/max(scale[1], 0.01f);
+ scale[2] = 1.0f/max(scale[2], 0.01f);
+ scale[6] = 1.0f/max(scale[4], 0.01f);
+ scale[7] = scale[8] = scale[9] = 1.0f/max(sqrtf(scale[5]), 0.01f);
+ scale[3] = scale[4] = scale[5] = 1.0f/max(sqrtf(scale[3]), 0.01f);
+}
+
+ccl_device_inline float3 filter_get_color(const ccl_global float *ccl_restrict buffer,
+ int pass_stride)
+{
+ return make_float3(ccl_get_feature(buffer, 8), ccl_get_feature(buffer, 9), ccl_get_feature(buffer, 10));
+}
+
+ccl_device_inline void design_row_add(float *design_row,
+ int rank,
+ const ccl_global float *ccl_restrict transform,
+ int stride,
+ int row,
+ float feature)
+{
+ for(int i = 0; i < rank; i++) {
+ design_row[1+i] += transform[(row*DENOISE_FEATURES + i)*stride]*feature;
+ }
+}
+
+/* Fill the design row. */
+ccl_device_inline void filter_get_design_row_transform(int2 p_pixel,
+ const ccl_global float *ccl_restrict p_buffer,
+ int2 q_pixel,
+ const ccl_global float *ccl_restrict q_buffer,
+ int pass_stride,
+ int rank,
+ float *design_row,
+ const ccl_global float *ccl_restrict transform,
+ int stride)
+{
+ design_row[0] = 1.0f;
+ math_vector_zero(design_row+1, rank);
+ design_row_add(design_row, rank, transform, stride, 0, q_pixel.x - p_pixel.x);
+ design_row_add(design_row, rank, transform, stride, 1, q_pixel.y - p_pixel.y);
+ design_row_add(design_row, rank, transform, stride, 2, fabsf(ccl_get_feature(q_buffer, 0)) - fabsf(ccl_get_feature(p_buffer, 0)));
+ design_row_add(design_row, rank, transform, stride, 3, ccl_get_feature(q_buffer, 1) - ccl_get_feature(p_buffer, 1));
+ design_row_add(design_row, rank, transform, stride, 4, ccl_get_feature(q_buffer, 2) - ccl_get_feature(p_buffer, 2));
+ design_row_add(design_row, rank, transform, stride, 5, ccl_get_feature(q_buffer, 3) - ccl_get_feature(p_buffer, 3));
+ design_row_add(design_row, rank, transform, stride, 6, ccl_get_feature(q_buffer, 4) - ccl_get_feature(p_buffer, 4));
+ design_row_add(design_row, rank, transform, stride, 7, ccl_get_feature(q_buffer, 5) - ccl_get_feature(p_buffer, 5));
+ design_row_add(design_row, rank, transform, stride, 8, ccl_get_feature(q_buffer, 6) - ccl_get_feature(p_buffer, 6));
+ design_row_add(design_row, rank, transform, stride, 9, ccl_get_feature(q_buffer, 7) - ccl_get_feature(p_buffer, 7));
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/filter/filter_features_sse.h b/intern/cycles/kernel/filter/filter_features_sse.h
new file mode 100644
index 00000000000..3185330994c
--- /dev/null
+++ b/intern/cycles/kernel/filter/filter_features_sse.h
@@ -0,0 +1,105 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+#define ccl_get_feature_sse(pass) _mm_loadu_ps(buffer + (pass)*pass_stride)
+
+/* Loop over the pixels in the range [low.x, high.x) x [low.y, high.y), 4 at a time.
+ * pixel_buffer always points to the first of the 4 current pixel in the first pass.
+ * x4 and y4 contain the coordinates of the four pixels, active_pixels contains a mask that's set for all pixels within the window. */
+
+#define FOR_PIXEL_WINDOW_SSE pixel_buffer = buffer + (low.y - rect.y)*buffer_w + (low.x - rect.x); \
+ for(pixel.y = low.y; pixel.y < high.y; pixel.y++) { \
+ __m128 y4 = _mm_set1_ps(pixel.y); \
+ for(pixel.x = low.x; pixel.x < high.x; pixel.x += 4, pixel_buffer += 4) { \
+ __m128 x4 = _mm_add_ps(_mm_set1_ps(pixel.x), _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); \
+ __m128 active_pixels = _mm_cmplt_ps(x4, _mm_set1_ps(high.x));
+
+#define END_FOR_PIXEL_WINDOW_SSE } \
+ pixel_buffer += buffer_w - (pixel.x - low.x); \
+ }
+
+ccl_device_inline void filter_get_features_sse(__m128 x, __m128 y,
+ __m128 active_pixels,
+ const float *ccl_restrict buffer,
+ __m128 *features,
+ const __m128 *ccl_restrict mean,
+ int pass_stride)
+{
+ features[0] = x;
+ features[1] = y;
+ features[2] = _mm_fabs_ps(ccl_get_feature_sse(0));
+ features[3] = ccl_get_feature_sse(1);
+ features[4] = ccl_get_feature_sse(2);
+ features[5] = ccl_get_feature_sse(3);
+ features[6] = ccl_get_feature_sse(4);
+ features[7] = ccl_get_feature_sse(5);
+ features[8] = ccl_get_feature_sse(6);
+ features[9] = ccl_get_feature_sse(7);
+ if(mean) {
+ for(int i = 0; i < DENOISE_FEATURES; i++)
+ features[i] = _mm_sub_ps(features[i], mean[i]);
+ }
+ for(int i = 0; i < DENOISE_FEATURES; i++)
+ features[i] = _mm_mask_ps(features[i], active_pixels);
+}
+
+ccl_device_inline void filter_get_feature_scales_sse(__m128 x, __m128 y,
+ __m128 active_pixels,
+ const float *ccl_restrict buffer,
+ __m128 *scales,
+ const __m128 *ccl_restrict mean,
+ int pass_stride)
+{
+ scales[0] = _mm_mask_ps(_mm_fabs_ps(_mm_sub_ps(x, mean[0])), active_pixels);
+ scales[1] = _mm_mask_ps(_mm_fabs_ps(_mm_sub_ps(y, mean[1])), active_pixels);
+
+ scales[2] = _mm_mask_ps(_mm_fabs_ps(_mm_sub_ps(_mm_fabs_ps(ccl_get_feature_sse(0)), mean[2])), active_pixels);
+
+ __m128 diff, scale;
+ diff = _mm_sub_ps(ccl_get_feature_sse(1), mean[3]);
+ scale = _mm_mul_ps(diff, diff);
+ diff = _mm_sub_ps(ccl_get_feature_sse(2), mean[4]);
+ scale = _mm_add_ps(scale, _mm_mul_ps(diff, diff));
+ diff = _mm_sub_ps(ccl_get_feature_sse(3), mean[5]);
+ scale = _mm_add_ps(scale, _mm_mul_ps(diff, diff));
+ scales[3] = _mm_mask_ps(scale, active_pixels);
+
+ scales[4] = _mm_mask_ps(_mm_fabs_ps(_mm_sub_ps(ccl_get_feature_sse(4), mean[6])), active_pixels);
+
+ diff = _mm_sub_ps(ccl_get_feature_sse(5), mean[7]);
+ scale = _mm_mul_ps(diff, diff);
+ diff = _mm_sub_ps(ccl_get_feature_sse(6), mean[8]);
+ scale = _mm_add_ps(scale, _mm_mul_ps(diff, diff));
+ diff = _mm_sub_ps(ccl_get_feature_sse(7), mean[9]);
+ scale = _mm_add_ps(scale, _mm_mul_ps(diff, diff));
+ scales[5] = _mm_mask_ps(scale, active_pixels);
+}
+
+ccl_device_inline void filter_calculate_scale_sse(__m128 *scale)
+{
+ scale[0] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(scale[0]), _mm_set1_ps(0.01f)));
+ scale[1] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(scale[1]), _mm_set1_ps(0.01f)));
+ scale[2] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(scale[2]), _mm_set1_ps(0.01f)));
+ scale[6] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(scale[4]), _mm_set1_ps(0.01f)));
+
+ scale[7] = scale[8] = scale[9] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(_mm_sqrt_ps(scale[5])), _mm_set1_ps(0.01f)));
+ scale[3] = scale[4] = scale[5] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(_mm_sqrt_ps(scale[3])), _mm_set1_ps(0.01f)));
+}
+
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/filter/filter_kernel.h b/intern/cycles/kernel/filter/filter_kernel.h
new file mode 100644
index 00000000000..2ef03dc0a02
--- /dev/null
+++ b/intern/cycles/kernel/filter/filter_kernel.h
@@ -0,0 +1,50 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "util/util_color.h"
+#include "util/util_math.h"
+#include "util/util_math_fast.h"
+#include "util/util_texture.h"
+
+#include "util/util_atomic.h"
+#include "util/util_math_matrix.h"
+
+#include "kernel/filter/filter_defines.h"
+
+#include "kernel/filter/filter_features.h"
+#ifdef __KERNEL_SSE3__
+# include "kernel/filter/filter_features_sse.h"
+#endif
+
+#include "kernel/filter/filter_prefilter.h"
+
+#ifdef __KERNEL_GPU__
+# include "kernel/filter/filter_transform_gpu.h"
+#else
+# ifdef __KERNEL_SSE3__
+# include "kernel/filter/filter_transform_sse.h"
+# else
+# include "kernel/filter/filter_transform.h"
+# endif
+#endif
+
+#include "kernel/filter/filter_reconstruction.h"
+
+#ifdef __KERNEL_CPU__
+# include "kernel/filter/filter_nlm_cpu.h"
+#else
+# include "kernel/filter/filter_nlm_gpu.h"
+#endif
diff --git a/intern/cycles/kernel/filter/filter_nlm_cpu.h b/intern/cycles/kernel/filter/filter_nlm_cpu.h
new file mode 100644
index 00000000000..3e752bce68f
--- /dev/null
+++ b/intern/cycles/kernel/filter/filter_nlm_cpu.h
@@ -0,0 +1,186 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+ccl_device_inline void kernel_filter_nlm_calc_difference(int dx, int dy,
+ const float *ccl_restrict weight_image,
+ const float *ccl_restrict variance_image,
+ float *difference_image,
+ int4 rect,
+ int w,
+ int channel_offset,
+ float a,
+ float k_2)
+{
+ for(int y = rect.y; y < rect.w; y++) {
+ for(int x = rect.x; x < rect.z; x++) {
+ float diff = 0.0f;
+ int numChannels = channel_offset? 3 : 1;
+ for(int c = 0; c < numChannels; c++) {
+ float cdiff = weight_image[c*channel_offset + y*w+x] - weight_image[c*channel_offset + (y+dy)*w+(x+dx)];
+ float pvar = variance_image[c*channel_offset + y*w+x];
+ float qvar = variance_image[c*channel_offset + (y+dy)*w+(x+dx)];
+ diff += (cdiff*cdiff - a*(pvar + min(pvar, qvar))) / (1e-8f + k_2*(pvar+qvar));
+ }
+ if(numChannels > 1) {
+ diff *= 1.0f/numChannels;
+ }
+ difference_image[y*w+x] = diff;
+ }
+ }
+}
+
+ccl_device_inline void kernel_filter_nlm_blur(const float *ccl_restrict difference_image,
+ float *out_image,
+ int4 rect,
+ int w,
+ int f)
+{
+#ifdef __KERNEL_SSE3__
+ int aligned_lowx = (rect.x & ~(3));
+ int aligned_highx = ((rect.z + 3) & ~(3));
+#endif
+ for(int y = rect.y; y < rect.w; y++) {
+ const int low = max(rect.y, y-f);
+ const int high = min(rect.w, y+f+1);
+ for(int x = rect.x; x < rect.z; x++) {
+ out_image[y*w+x] = 0.0f;
+ }
+ for(int y1 = low; y1 < high; y1++) {
+#ifdef __KERNEL_SSE3__
+ for(int x = aligned_lowx; x < aligned_highx; x+=4) {
+ _mm_store_ps(out_image + y*w+x, _mm_add_ps(_mm_load_ps(out_image + y*w+x), _mm_load_ps(difference_image + y1*w+x)));
+ }
+#else
+ for(int x = rect.x; x < rect.z; x++) {
+ out_image[y*w+x] += difference_image[y1*w+x];
+ }
+#endif
+ }
+ for(int x = rect.x; x < rect.z; x++) {
+ out_image[y*w+x] *= 1.0f/(high - low);
+ }
+ }
+}
+
+ccl_device_inline void kernel_filter_nlm_calc_weight(const float *ccl_restrict difference_image,
+ float *out_image,
+ int4 rect,
+ int w,
+ int f)
+{
+ for(int y = rect.y; y < rect.w; y++) {
+ for(int x = rect.x; x < rect.z; x++) {
+ out_image[y*w+x] = 0.0f;
+ }
+ }
+ for(int dx = -f; dx <= f; dx++) {
+ int pos_dx = max(0, dx);
+ int neg_dx = min(0, dx);
+ for(int y = rect.y; y < rect.w; y++) {
+ for(int x = rect.x-neg_dx; x < rect.z-pos_dx; x++) {
+ out_image[y*w+x] += difference_image[y*w+dx+x];
+ }
+ }
+ }
+ for(int y = rect.y; y < rect.w; y++) {
+ for(int x = rect.x; x < rect.z; x++) {
+ const int low = max(rect.x, x-f);
+ const int high = min(rect.z, x+f+1);
+ out_image[y*w+x] = fast_expf(-max(out_image[y*w+x] * (1.0f/(high - low)), 0.0f));
+ }
+ }
+}
+
+ccl_device_inline void kernel_filter_nlm_update_output(int dx, int dy,
+ const float *ccl_restrict difference_image,
+ const float *ccl_restrict image,
+ float *out_image,
+ float *accum_image,
+ int4 rect,
+ int w,
+ int f)
+{
+ for(int y = rect.y; y < rect.w; y++) {
+ for(int x = rect.x; x < rect.z; x++) {
+ const int low = max(rect.x, x-f);
+ const int high = min(rect.z, x+f+1);
+ float sum = 0.0f;
+ for(int x1 = low; x1 < high; x1++) {
+ sum += difference_image[y*w+x1];
+ }
+ float weight = sum * (1.0f/(high - low));
+ accum_image[y*w+x] += weight;
+ out_image[y*w+x] += weight*image[(y+dy)*w+(x+dx)];
+ }
+ }
+}
+
+ccl_device_inline void kernel_filter_nlm_construct_gramian(int dx, int dy,
+ const float *ccl_restrict difference_image,
+ const float *ccl_restrict buffer,
+ float *transform,
+ int *rank,
+ float *XtWX,
+ float3 *XtWY,
+ int4 rect,
+ int4 filter_rect,
+ int w, int h, int f,
+ int pass_stride)
+{
+ /* fy and fy are in filter-window-relative coordinates, while x and y are in feature-window-relative coordinates. */
+ for(int fy = max(0, rect.y-filter_rect.y); fy < min(filter_rect.w, rect.w-filter_rect.y); fy++) {
+ int y = fy + filter_rect.y;
+ for(int fx = max(0, rect.x-filter_rect.x); fx < min(filter_rect.z, rect.z-filter_rect.x); fx++) {
+ int x = fx + filter_rect.x;
+ const int low = max(rect.x, x-f);
+ const int high = min(rect.z, x+f+1);
+ float sum = 0.0f;
+ for(int x1 = low; x1 < high; x1++) {
+ sum += difference_image[y*w+x1];
+ }
+ float weight = sum * (1.0f/(high - low));
+
+ int storage_ofs = fy*filter_rect.z + fx;
+ float *l_transform = transform + storage_ofs*TRANSFORM_SIZE;
+ float *l_XtWX = XtWX + storage_ofs*XTWX_SIZE;
+ float3 *l_XtWY = XtWY + storage_ofs*XTWY_SIZE;
+ int *l_rank = rank + storage_ofs;
+
+ kernel_filter_construct_gramian(x, y, 1,
+ dx, dy, w, h,
+ pass_stride,
+ buffer,
+ l_transform, l_rank,
+ weight, l_XtWX, l_XtWY, 0);
+ }
+ }
+}
+
+ccl_device_inline void kernel_filter_nlm_normalize(float *out_image,
+ const float *ccl_restrict accum_image,
+ int4 rect,
+ int w)
+{
+ for(int y = rect.y; y < rect.w; y++) {
+ for(int x = rect.x; x < rect.z; x++) {
+ out_image[y*w+x] /= accum_image[y*w+x];
+ }
+ }
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/filter/filter_nlm_gpu.h b/intern/cycles/kernel/filter/filter_nlm_gpu.h
new file mode 100644
index 00000000000..2c5ac807051
--- /dev/null
+++ b/intern/cycles/kernel/filter/filter_nlm_gpu.h
@@ -0,0 +1,144 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+ccl_device_inline void kernel_filter_nlm_calc_difference(int x, int y,
+ int dx, int dy,
+ const ccl_global float *ccl_restrict weight_image,
+ const ccl_global float *ccl_restrict variance_image,
+ ccl_global float *difference_image,
+ int4 rect, int w,
+ int channel_offset,
+ float a, float k_2)
+{
+ float diff = 0.0f;
+ int numChannels = channel_offset? 3 : 1;
+ for(int c = 0; c < numChannels; c++) {
+ float cdiff = weight_image[c*channel_offset + y*w+x] - weight_image[c*channel_offset + (y+dy)*w+(x+dx)];
+ float pvar = variance_image[c*channel_offset + y*w+x];
+ float qvar = variance_image[c*channel_offset + (y+dy)*w+(x+dx)];
+ diff += (cdiff*cdiff - a*(pvar + min(pvar, qvar))) / (1e-8f + k_2*(pvar+qvar));
+ }
+ if(numChannels > 1) {
+ diff *= 1.0f/numChannels;
+ }
+ difference_image[y*w+x] = diff;
+}
+
+ccl_device_inline void kernel_filter_nlm_blur(int x, int y,
+ const ccl_global float *ccl_restrict difference_image,
+ ccl_global float *out_image,
+ int4 rect, int w, int f)
+{
+ float sum = 0.0f;
+ const int low = max(rect.y, y-f);
+ const int high = min(rect.w, y+f+1);
+ for(int y1 = low; y1 < high; y1++) {
+ sum += difference_image[y1*w+x];
+ }
+ sum *= 1.0f/(high-low);
+ out_image[y*w+x] = sum;
+}
+
+ccl_device_inline void kernel_filter_nlm_calc_weight(int x, int y,
+ const ccl_global float *ccl_restrict difference_image,
+ ccl_global float *out_image,
+ int4 rect, int w, int f)
+{
+ float sum = 0.0f;
+ const int low = max(rect.x, x-f);
+ const int high = min(rect.z, x+f+1);
+ for(int x1 = low; x1 < high; x1++) {
+ sum += difference_image[y*w+x1];
+ }
+ sum *= 1.0f/(high-low);
+ out_image[y*w+x] = fast_expf(-max(sum, 0.0f));
+}
+
+ccl_device_inline void kernel_filter_nlm_update_output(int x, int y,
+ int dx, int dy,
+ const ccl_global float *ccl_restrict difference_image,
+ const ccl_global float *ccl_restrict image,
+ ccl_global float *out_image,
+ ccl_global float *accum_image,
+ int4 rect, int w, int f)
+{
+ float sum = 0.0f;
+ const int low = max(rect.x, x-f);
+ const int high = min(rect.z, x+f+1);
+ for(int x1 = low; x1 < high; x1++) {
+ sum += difference_image[y*w+x1];
+ }
+ sum *= 1.0f/(high-low);
+ if(out_image) {
+ accum_image[y*w+x] += sum;
+ out_image[y*w+x] += sum*image[(y+dy)*w+(x+dx)];
+ }
+ else {
+ accum_image[y*w+x] = sum;
+ }
+}
+
+ccl_device_inline void kernel_filter_nlm_construct_gramian(int fx, int fy,
+ int dx, int dy,
+ const ccl_global float *ccl_restrict difference_image,
+ const ccl_global float *ccl_restrict buffer,
+ const ccl_global float *ccl_restrict transform,
+ ccl_global int *rank,
+ ccl_global float *XtWX,
+ ccl_global float3 *XtWY,
+ int4 rect,
+ int4 filter_rect,
+ int w, int h, int f,
+ int pass_stride,
+ int localIdx)
+{
+ int y = fy + filter_rect.y;
+ int x = fx + filter_rect.x;
+ const int low = max(rect.x, x-f);
+ const int high = min(rect.z, x+f+1);
+ float sum = 0.0f;
+ for(int x1 = low; x1 < high; x1++) {
+ sum += difference_image[y*w+x1];
+ }
+ float weight = sum * (1.0f/(high - low));
+
+ int storage_ofs = fy*filter_rect.z + fx;
+ transform += storage_ofs;
+ rank += storage_ofs;
+ XtWX += storage_ofs;
+ XtWY += storage_ofs;
+
+ kernel_filter_construct_gramian(x, y,
+ filter_rect.z*filter_rect.w,
+ dx, dy, w, h,
+ pass_stride,
+ buffer,
+ transform, rank,
+ weight, XtWX, XtWY,
+ localIdx);
+}
+
+ccl_device_inline void kernel_filter_nlm_normalize(int x, int y,
+ ccl_global float *out_image,
+ const ccl_global float *ccl_restrict accum_image,
+ int4 rect, int w)
+{
+ out_image[y*w+x] /= accum_image[y*w+x];
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/filter/filter_prefilter.h b/intern/cycles/kernel/filter/filter_prefilter.h
new file mode 100644
index 00000000000..a0b89c1111f
--- /dev/null
+++ b/intern/cycles/kernel/filter/filter_prefilter.h
@@ -0,0 +1,211 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+/* First step of the shadow prefiltering, performs the shadow division and stores all data
+ * in a nice and easy rectangular array that can be passed to the NLM filter.
+ *
+ * Calculates:
+ * unfiltered: Contains the two half images of the shadow feature pass
+ * sampleVariance: The sample-based variance calculated in the kernel. Note: This calculation is biased in general, and especially here since the variance of the ratio can only be approximated.
+ * sampleVarianceV: Variance of the sample variance estimation, quite noisy (since it's essentially the buffer variance of the two variance halves)
+ * bufferVariance: The buffer-based variance of the shadow feature. Unbiased, but quite noisy.
+ */
+ccl_device void kernel_filter_divide_shadow(int sample,
+ ccl_global TilesInfo *tiles,
+ int x, int y,
+ ccl_global float *unfilteredA,
+ ccl_global float *unfilteredB,
+ ccl_global float *sampleVariance,
+ ccl_global float *sampleVarianceV,
+ ccl_global float *bufferVariance,
+ int4 rect,
+ int buffer_pass_stride,
+ int buffer_denoising_offset,
+ bool use_split_variance)
+{
+ int xtile = (x < tiles->x[1])? 0: ((x < tiles->x[2])? 1: 2);
+ int ytile = (y < tiles->y[1])? 0: ((y < tiles->y[2])? 1: 2);
+ int tile = ytile*3+xtile;
+
+ int offset = tiles->offsets[tile];
+ int stride = tiles->strides[tile];
+ const ccl_global float *ccl_restrict center_buffer = (ccl_global float*) tiles->buffers[tile];
+ center_buffer += (y*stride + x + offset)*buffer_pass_stride;
+ center_buffer += buffer_denoising_offset + 14;
+
+ int buffer_w = align_up(rect.z - rect.x, 4);
+ int idx = (y-rect.y)*buffer_w + (x - rect.x);
+ unfilteredA[idx] = center_buffer[1] / max(center_buffer[0], 1e-7f);
+ unfilteredB[idx] = center_buffer[4] / max(center_buffer[3], 1e-7f);
+
+ float varA = center_buffer[2];
+ float varB = center_buffer[5];
+ int odd_sample = (sample+1)/2;
+ int even_sample = sample/2;
+ if(use_split_variance) {
+ varA = max(0.0f, varA - unfilteredA[idx]*unfilteredA[idx]*odd_sample);
+ varB = max(0.0f, varB - unfilteredB[idx]*unfilteredB[idx]*even_sample);
+ }
+ varA /= max(odd_sample - 1, 1);
+ varB /= max(even_sample - 1, 1);
+
+ sampleVariance[idx] = 0.5f*(varA + varB) / sample;
+ sampleVarianceV[idx] = 0.5f * (varA - varB) * (varA - varB) / (sample*sample);
+ bufferVariance[idx] = 0.5f * (unfilteredA[idx] - unfilteredB[idx]) * (unfilteredA[idx] - unfilteredB[idx]);
+}
+
+/* Load a regular feature from the render buffers into the denoise buffer.
+ * Parameters:
+ * - sample: The sample amount in the buffer, used to normalize the buffer.
+ * - m_offset, v_offset: Render Buffer Pass offsets of mean and variance of the feature.
+ * - x, y: Current pixel
+ * - mean, variance: Target denoise buffers.
+ * - rect: The prefilter area (lower pixels inclusive, upper pixels exclusive).
+ */
+ccl_device void kernel_filter_get_feature(int sample,
+ ccl_global TilesInfo *tiles,
+ int m_offset, int v_offset,
+ int x, int y,
+ ccl_global float *mean,
+ ccl_global float *variance,
+ int4 rect, int buffer_pass_stride,
+ int buffer_denoising_offset,
+ bool use_split_variance)
+{
+ int xtile = (x < tiles->x[1])? 0: ((x < tiles->x[2])? 1: 2);
+ int ytile = (y < tiles->y[1])? 0: ((y < tiles->y[2])? 1: 2);
+ int tile = ytile*3+xtile;
+ ccl_global float *center_buffer = ((ccl_global float*) tiles->buffers[tile]) + (tiles->offsets[tile] + y*tiles->strides[tile] + x)*buffer_pass_stride + buffer_denoising_offset;
+
+ int buffer_w = align_up(rect.z - rect.x, 4);
+ int idx = (y-rect.y)*buffer_w + (x - rect.x);
+
+ mean[idx] = center_buffer[m_offset] / sample;
+ if (sample > 1) {
+ if(use_split_variance) {
+ variance[idx] = max(0.0f, (center_buffer[v_offset] - mean[idx]*mean[idx]*sample) / (sample * (sample-1)));
+ }
+ else {
+ variance[idx] = center_buffer[v_offset] / (sample * (sample-1));
+ }
+ }
+ else {
+ /* Can't compute variance with single sample, just set it very high. */
+ variance[idx] = 1e10f;
+ }
+}
+
+ccl_device void kernel_filter_detect_outliers(int x, int y,
+ ccl_global float *image,
+ ccl_global float *variance,
+ ccl_global float *depth,
+ ccl_global float *out,
+ int4 rect,
+ int pass_stride)
+{
+ int buffer_w = align_up(rect.z - rect.x, 4);
+
+ int n = 0;
+ float values[25];
+ for(int y1 = max(y-2, rect.y); y1 < min(y+3, rect.w); y1++) {
+ for(int x1 = max(x-2, rect.x); x1 < min(x+3, rect.z); x1++) {
+ int idx = (y1-rect.y)*buffer_w + (x1-rect.x);
+ float L = average(make_float3(image[idx], image[idx+pass_stride], image[idx+2*pass_stride]));
+
+ /* Find the position of L. */
+ int i;
+ for(i = 0; i < n; i++) {
+ if(values[i] > L) break;
+ }
+ /* Make space for L by shifting all following values to the right. */
+ for(int j = n; j > i; j--) {
+ values[j] = values[j-1];
+ }
+ /* Insert L. */
+ values[i] = L;
+ n++;
+ }
+ }
+
+ int idx = (y-rect.y)*buffer_w + (x-rect.x);
+ float L = average(make_float3(image[idx], image[idx+pass_stride], image[idx+2*pass_stride]));
+
+ float ref = 2.0f*values[(int)(n*0.75f)];
+ float fac = 1.0f;
+ if(L > ref) {
+ /* The pixel appears to be an outlier.
+ * However, it may just be a legitimate highlight. Therefore, it is checked how likely it is that the pixel
+ * should actually be at the reference value:
+ * If the reference is within the 3-sigma interval, the pixel is assumed to be a statistical outlier.
+ * Otherwise, it is very unlikely that the pixel should be darker, which indicates a legitimate highlight.
+ */
+ float stddev = sqrtf(average(make_float3(variance[idx], variance[idx+pass_stride], variance[idx+2*pass_stride])));
+ if(L - 3*stddev < ref) {
+ /* The pixel is an outlier, so negate the depth value to mark it as one.
+ * Also, scale its brightness down to the outlier threshold to avoid trouble with the NLM weights. */
+ depth[idx] = -depth[idx];
+ fac = ref/L;
+ variance[idx ] *= fac*fac;
+ variance[idx + pass_stride] *= fac*fac;
+ variance[idx+2*pass_stride] *= fac*fac;
+ }
+ }
+ out[idx ] = fac*image[idx];
+ out[idx + pass_stride] = fac*image[idx + pass_stride];
+ out[idx+2*pass_stride] = fac*image[idx+2*pass_stride];
+}
+
+/* Combine A/B buffers.
+ * Calculates the combined mean and the buffer variance. */
+ccl_device void kernel_filter_combine_halves(int x, int y,
+ ccl_global float *mean,
+ ccl_global float *variance,
+ ccl_global float *a,
+ ccl_global float *b,
+ int4 rect, int r)
+{
+ int buffer_w = align_up(rect.z - rect.x, 4);
+ int idx = (y-rect.y)*buffer_w + (x - rect.x);
+
+ if(mean) mean[idx] = 0.5f * (a[idx]+b[idx]);
+ if(variance) {
+ if(r == 0) variance[idx] = 0.25f * (a[idx]-b[idx])*(a[idx]-b[idx]);
+ else {
+ variance[idx] = 0.0f;
+ float values[25];
+ int numValues = 0;
+ for(int py = max(y-r, rect.y); py < min(y+r+1, rect.w); py++) {
+ for(int px = max(x-r, rect.x); px < min(x+r+1, rect.z); px++) {
+ int pidx = (py-rect.y)*buffer_w + (px-rect.x);
+ values[numValues++] = 0.25f * (a[pidx]-b[pidx])*(a[pidx]-b[pidx]);
+ }
+ }
+ /* Insertion-sort the variances (fast enough for 25 elements). */
+ for(int i = 1; i < numValues; i++) {
+ float v = values[i];
+ int j;
+ for(j = i-1; j >= 0 && values[j] > v; j--)
+ values[j+1] = values[j];
+ values[j+1] = v;
+ }
+ variance[idx] = values[(7*numValues)/8];
+ }
+ }
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/filter/filter_reconstruction.h b/intern/cycles/kernel/filter/filter_reconstruction.h
new file mode 100644
index 00000000000..25a3025056c
--- /dev/null
+++ b/intern/cycles/kernel/filter/filter_reconstruction.h
@@ -0,0 +1,117 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+ccl_device_inline void kernel_filter_construct_gramian(int x, int y,
+ int storage_stride,
+ int dx, int dy,
+ int w, int h,
+ int pass_stride,
+ const ccl_global float *ccl_restrict buffer,
+ const ccl_global float *ccl_restrict transform,
+ ccl_global int *rank,
+ float weight,
+ ccl_global float *XtWX,
+ ccl_global float3 *XtWY,
+ int localIdx)
+{
+ if(weight < 1e-3f) {
+ return;
+ }
+
+ int p_offset = y *w + x;
+ int q_offset = (y+dy)*w + (x+dx);
+
+#ifdef __KERNEL_GPU__
+ const int stride = storage_stride;
+#else
+ const int stride = 1;
+ (void) storage_stride;
+#endif
+
+#ifdef __KERNEL_CUDA__
+ ccl_local float shared_design_row[(DENOISE_FEATURES+1)*CCL_MAX_LOCAL_SIZE];
+ ccl_local_param float *design_row = shared_design_row + localIdx*(DENOISE_FEATURES+1);
+#else
+ float design_row[DENOISE_FEATURES+1];
+#endif
+
+ float3 q_color = filter_get_color(buffer + q_offset, pass_stride);
+
+ /* If the pixel was flagged as an outlier during prefiltering, skip it. */
+ if(ccl_get_feature(buffer + q_offset, 0) < 0.0f) {
+ return;
+ }
+
+ filter_get_design_row_transform(make_int2(x, y), buffer + p_offset,
+ make_int2(x+dx, y+dy), buffer + q_offset,
+ pass_stride, *rank, design_row, transform, stride);
+
+ math_trimatrix_add_gramian_strided(XtWX, (*rank)+1, design_row, weight, stride);
+ math_vec3_add_strided(XtWY, (*rank)+1, design_row, weight * q_color, stride);
+}
+
+ccl_device_inline void kernel_filter_finalize(int x, int y, int w, int h,
+ ccl_global float *buffer,
+ ccl_global int *rank,
+ int storage_stride,
+ ccl_global float *XtWX,
+ ccl_global float3 *XtWY,
+ int4 buffer_params,
+ int sample)
+{
+#ifdef __KERNEL_GPU__
+ const int stride = storage_stride;
+#else
+ const int stride = 1;
+ (void) storage_stride;
+#endif
+
+ if(XtWX[0] < 1e-3f) {
+ /* There is not enough information to determine a denoised result.
+ * As a fallback, keep the original value of the pixel. */
+ return;
+ }
+
+ /* The weighted average of pixel colors (essentially, the NLM-filtered image).
+ * In case the solution of the linear model fails due to numerical issues,
+ * fall back to this value. */
+ float3 mean_color = XtWY[0]/XtWX[0];
+
+ math_trimatrix_vec3_solve(XtWX, XtWY, (*rank)+1, stride);
+
+ float3 final_color = XtWY[0];
+ if(!isfinite3_safe(final_color)) {
+ final_color = mean_color;
+ }
+
+ /* Clamp pixel value to positive values. */
+ final_color = max(final_color, make_float3(0.0f, 0.0f, 0.0f));
+
+ ccl_global float *combined_buffer = buffer + (y*buffer_params.y + x + buffer_params.x)*buffer_params.z;
+ final_color *= sample;
+ if(buffer_params.w) {
+ final_color.x += combined_buffer[buffer_params.w+0];
+ final_color.y += combined_buffer[buffer_params.w+1];
+ final_color.z += combined_buffer[buffer_params.w+2];
+ }
+ combined_buffer[0] = final_color.x;
+ combined_buffer[1] = final_color.y;
+ combined_buffer[2] = final_color.z;
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/filter/filter_transform.h b/intern/cycles/kernel/filter/filter_transform.h
new file mode 100644
index 00000000000..a5f87c05ec0
--- /dev/null
+++ b/intern/cycles/kernel/filter/filter_transform.h
@@ -0,0 +1,108 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+ccl_device void kernel_filter_construct_transform(const float *ccl_restrict buffer,
+ int x, int y, int4 rect,
+ int pass_stride,
+ float *transform, int *rank,
+ int radius, float pca_threshold)
+{
+ int buffer_w = align_up(rect.z - rect.x, 4);
+
+ float features[DENOISE_FEATURES];
+
+ /* Temporary storage, used in different steps of the algorithm. */
+ float tempmatrix[DENOISE_FEATURES*DENOISE_FEATURES];
+ float tempvector[2*DENOISE_FEATURES];
+ const float *ccl_restrict pixel_buffer;
+ int2 pixel;
+
+ /* === Calculate denoising window. === */
+ int2 low = make_int2(max(rect.x, x - radius),
+ max(rect.y, y - radius));
+ int2 high = make_int2(min(rect.z, x + radius + 1),
+ min(rect.w, y + radius + 1));
+ int num_pixels = (high.y - low.y) * (high.x - low.x);
+
+ /* === Shift feature passes to have mean 0. === */
+ float feature_means[DENOISE_FEATURES];
+ math_vector_zero(feature_means, DENOISE_FEATURES);
+ FOR_PIXEL_WINDOW {
+ filter_get_features(pixel, pixel_buffer, features, NULL, pass_stride);
+ math_vector_add(feature_means, features, DENOISE_FEATURES);
+ } END_FOR_PIXEL_WINDOW
+
+ math_vector_scale(feature_means, 1.0f / num_pixels, DENOISE_FEATURES);
+
+ /* === Scale the shifted feature passes to a range of [-1; 1], will be baked into the transform later. === */
+ float *feature_scale = tempvector;
+ math_vector_zero(feature_scale, DENOISE_FEATURES);
+
+ FOR_PIXEL_WINDOW {
+ filter_get_feature_scales(pixel, pixel_buffer, features, feature_means, pass_stride);
+ math_vector_max(feature_scale, features, DENOISE_FEATURES);
+ } END_FOR_PIXEL_WINDOW
+
+ filter_calculate_scale(feature_scale);
+
+ /* === Generate the feature transformation. ===
+ * This transformation maps the DENOISE_FEATURES-dimensional feature space to a reduced feature (r-feature) space
+ * which generally has fewer dimensions. This mainly helps to prevent overfitting. */
+ float* feature_matrix = tempmatrix;
+ math_matrix_zero(feature_matrix, DENOISE_FEATURES);
+ FOR_PIXEL_WINDOW {
+ filter_get_features(pixel, pixel_buffer, features, feature_means, pass_stride);
+ math_vector_mul(features, feature_scale, DENOISE_FEATURES);
+ math_matrix_add_gramian(feature_matrix, DENOISE_FEATURES, features, 1.0f);
+ } END_FOR_PIXEL_WINDOW
+
+ math_matrix_jacobi_eigendecomposition(feature_matrix, transform, DENOISE_FEATURES, 1);
+ *rank = 0;
+ /* Prevent overfitting when a small window is used. */
+ int max_rank = min(DENOISE_FEATURES, num_pixels/3);
+ if(pca_threshold < 0.0f) {
+ float threshold_energy = 0.0f;
+ for(int i = 0; i < DENOISE_FEATURES; i++) {
+ threshold_energy += feature_matrix[i*DENOISE_FEATURES+i];
+ }
+ threshold_energy *= 1.0f - (-pca_threshold);
+
+ float reduced_energy = 0.0f;
+ for(int i = 0; i < max_rank; i++, (*rank)++) {
+ if(i >= 2 && reduced_energy >= threshold_energy)
+ break;
+ float s = feature_matrix[i*DENOISE_FEATURES+i];
+ reduced_energy += s;
+ }
+ }
+ else {
+ for(int i = 0; i < max_rank; i++, (*rank)++) {
+ float s = feature_matrix[i*DENOISE_FEATURES+i];
+ if(i >= 2 && sqrtf(s) < pca_threshold)
+ break;
+ }
+ }
+
+ /* Bake the feature scaling into the transformation matrix. */
+ for(int i = 0; i < (*rank); i++) {
+ math_vector_mul(transform + i*DENOISE_FEATURES, feature_scale, DENOISE_FEATURES);
+ }
+ math_matrix_transpose(transform, DENOISE_FEATURES, 1);
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/filter/filter_transform_gpu.h b/intern/cycles/kernel/filter/filter_transform_gpu.h
new file mode 100644
index 00000000000..83a1222bbdb
--- /dev/null
+++ b/intern/cycles/kernel/filter/filter_transform_gpu.h
@@ -0,0 +1,119 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+ccl_device void kernel_filter_construct_transform(const ccl_global float *ccl_restrict buffer,
+ int x, int y, int4 rect,
+ int pass_stride,
+ ccl_global float *transform,
+ ccl_global int *rank,
+ int radius, float pca_threshold,
+ int transform_stride, int localIdx)
+{
+ int buffer_w = align_up(rect.z - rect.x, 4);
+
+#ifdef __KERNEL_CUDA__
+ ccl_local float shared_features[DENOISE_FEATURES*CCL_MAX_LOCAL_SIZE];
+ ccl_local_param float *features = shared_features + localIdx*DENOISE_FEATURES;
+#else
+ float features[DENOISE_FEATURES];
+#endif
+
+ /* === Calculate denoising window. === */
+ int2 low = make_int2(max(rect.x, x - radius),
+ max(rect.y, y - radius));
+ int2 high = make_int2(min(rect.z, x + radius + 1),
+ min(rect.w, y + radius + 1));
+ int num_pixels = (high.y - low.y) * (high.x - low.x);
+ const ccl_global float *ccl_restrict pixel_buffer;
+ int2 pixel;
+
+
+
+
+ /* === Shift feature passes to have mean 0. === */
+ float feature_means[DENOISE_FEATURES];
+ math_vector_zero(feature_means, DENOISE_FEATURES);
+ FOR_PIXEL_WINDOW {
+ filter_get_features(pixel, pixel_buffer, features, NULL, pass_stride);
+ math_vector_add(feature_means, features, DENOISE_FEATURES);
+ } END_FOR_PIXEL_WINDOW
+
+ math_vector_scale(feature_means, 1.0f / num_pixels, DENOISE_FEATURES);
+
+ /* === Scale the shifted feature passes to a range of [-1; 1], will be baked into the transform later. === */
+ float feature_scale[DENOISE_FEATURES];
+ math_vector_zero(feature_scale, DENOISE_FEATURES);
+
+ FOR_PIXEL_WINDOW {
+ filter_get_feature_scales(pixel, pixel_buffer, features, feature_means, pass_stride);
+ math_vector_max(feature_scale, features, DENOISE_FEATURES);
+ } END_FOR_PIXEL_WINDOW
+
+ filter_calculate_scale(feature_scale);
+
+
+
+ /* === Generate the feature transformation. ===
+ * This transformation maps the DENOISE_FEATURES-dimensional feature space to a reduced feature (r-feature) space
+ * which generally has fewer dimensions. This mainly helps to prevent overfitting. */
+ float feature_matrix[DENOISE_FEATURES*DENOISE_FEATURES];
+ math_matrix_zero(feature_matrix, DENOISE_FEATURES);
+ FOR_PIXEL_WINDOW {
+ filter_get_features(pixel, pixel_buffer, features, feature_means, pass_stride);
+ math_vector_mul(features, feature_scale, DENOISE_FEATURES);
+ math_matrix_add_gramian(feature_matrix, DENOISE_FEATURES, features, 1.0f);
+ } END_FOR_PIXEL_WINDOW
+
+ math_matrix_jacobi_eigendecomposition(feature_matrix, transform, DENOISE_FEATURES, transform_stride);
+ *rank = 0;
+ /* Prevent overfitting when a small window is used. */
+ int max_rank = min(DENOISE_FEATURES, num_pixels/3);
+ if(pca_threshold < 0.0f) {
+ float threshold_energy = 0.0f;
+ for(int i = 0; i < DENOISE_FEATURES; i++) {
+ threshold_energy += feature_matrix[i*DENOISE_FEATURES+i];
+ }
+ threshold_energy *= 1.0f - (-pca_threshold);
+
+ float reduced_energy = 0.0f;
+ for(int i = 0; i < max_rank; i++, (*rank)++) {
+ if(i >= 2 && reduced_energy >= threshold_energy)
+ break;
+ float s = feature_matrix[i*DENOISE_FEATURES+i];
+ reduced_energy += s;
+ }
+ }
+ else {
+ for(int i = 0; i < max_rank; i++, (*rank)++) {
+ float s = feature_matrix[i*DENOISE_FEATURES+i];
+ if(i >= 2 && sqrtf(s) < pca_threshold)
+ break;
+ }
+ }
+
+ math_matrix_transpose(transform, DENOISE_FEATURES, transform_stride);
+
+ /* Bake the feature scaling into the transformation matrix. */
+ for(int i = 0; i < DENOISE_FEATURES; i++) {
+ for(int j = 0; j < (*rank); j++) {
+ transform[(i*DENOISE_FEATURES + j)*transform_stride] *= feature_scale[i];
+ }
+ }
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/filter/filter_transform_sse.h b/intern/cycles/kernel/filter/filter_transform_sse.h
new file mode 100644
index 00000000000..30dc2969b11
--- /dev/null
+++ b/intern/cycles/kernel/filter/filter_transform_sse.h
@@ -0,0 +1,105 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+ccl_device void kernel_filter_construct_transform(const float *ccl_restrict buffer,
+ int x, int y, int4 rect,
+ int pass_stride,
+ float *transform, int *rank,
+ int radius, float pca_threshold)
+{
+ int buffer_w = align_up(rect.z - rect.x, 4);
+
+ __m128 features[DENOISE_FEATURES];
+ const float *ccl_restrict pixel_buffer;
+ int2 pixel;
+
+ int2 low = make_int2(max(rect.x, x - radius),
+ max(rect.y, y - radius));
+ int2 high = make_int2(min(rect.z, x + radius + 1),
+ min(rect.w, y + radius + 1));
+ int num_pixels = (high.y - low.y) * (high.x - low.x);
+
+ __m128 feature_means[DENOISE_FEATURES];
+ math_vector_zero_sse(feature_means, DENOISE_FEATURES);
+ FOR_PIXEL_WINDOW_SSE {
+ filter_get_features_sse(x4, y4, active_pixels, pixel_buffer, features, NULL, pass_stride);
+ math_vector_add_sse(feature_means, DENOISE_FEATURES, features);
+ } END_FOR_PIXEL_WINDOW_SSE
+
+ __m128 pixel_scale = _mm_set1_ps(1.0f / num_pixels);
+ for(int i = 0; i < DENOISE_FEATURES; i++) {
+ feature_means[i] = _mm_mul_ps(_mm_hsum_ps(feature_means[i]), pixel_scale);
+ }
+
+ __m128 feature_scale[DENOISE_FEATURES];
+ math_vector_zero_sse(feature_scale, DENOISE_FEATURES);
+ FOR_PIXEL_WINDOW_SSE {
+ filter_get_feature_scales_sse(x4, y4, active_pixels, pixel_buffer, features, feature_means, pass_stride);
+ math_vector_max_sse(feature_scale, features, DENOISE_FEATURES);
+ } END_FOR_PIXEL_WINDOW_SSE
+
+ filter_calculate_scale_sse(feature_scale);
+
+ __m128 feature_matrix_sse[DENOISE_FEATURES*DENOISE_FEATURES];
+ math_matrix_zero_sse(feature_matrix_sse, DENOISE_FEATURES);
+ FOR_PIXEL_WINDOW_SSE {
+ filter_get_features_sse(x4, y4, active_pixels, pixel_buffer, features, feature_means, pass_stride);
+ math_vector_mul_sse(features, DENOISE_FEATURES, feature_scale);
+ math_matrix_add_gramian_sse(feature_matrix_sse, DENOISE_FEATURES, features, _mm_set1_ps(1.0f));
+ } END_FOR_PIXEL_WINDOW_SSE
+
+ float feature_matrix[DENOISE_FEATURES*DENOISE_FEATURES];
+ math_matrix_hsum(feature_matrix, DENOISE_FEATURES, feature_matrix_sse);
+
+ math_matrix_jacobi_eigendecomposition(feature_matrix, transform, DENOISE_FEATURES, 1);
+
+ *rank = 0;
+ /* Prevent overfitting when a small window is used. */
+ int max_rank = min(DENOISE_FEATURES, num_pixels/3);
+ if(pca_threshold < 0.0f) {
+ float threshold_energy = 0.0f;
+ for(int i = 0; i < DENOISE_FEATURES; i++) {
+ threshold_energy += feature_matrix[i*DENOISE_FEATURES+i];
+ }
+ threshold_energy *= 1.0f - (-pca_threshold);
+
+ float reduced_energy = 0.0f;
+ for(int i = 0; i < max_rank; i++, (*rank)++) {
+ if(i >= 2 && reduced_energy >= threshold_energy)
+ break;
+ float s = feature_matrix[i*DENOISE_FEATURES+i];
+ reduced_energy += s;
+ }
+ }
+ else {
+ for(int i = 0; i < max_rank; i++, (*rank)++) {
+ float s = feature_matrix[i*DENOISE_FEATURES+i];
+ if(i >= 2 && sqrtf(s) < pca_threshold)
+ break;
+ }
+ }
+
+ math_matrix_transpose(transform, DENOISE_FEATURES, 1);
+
+ /* Bake the feature scaling into the transformation matrix. */
+ for(int i = 0; i < DENOISE_FEATURES; i++) {
+ math_vector_scale(transform + i*DENOISE_FEATURES, _mm_cvtss_f32(feature_scale[i]), *rank);
+ }
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/geom/geom_curve.h b/intern/cycles/kernel/geom/geom_curve.h
index 8888000f0e6..5c3b0ee3c15 100644
--- a/intern/cycles/kernel/geom/geom_curve.h
+++ b/intern/cycles/kernel/geom/geom_curve.h
@@ -565,7 +565,7 @@ ccl_device_curveintersect bool bvh_cardinal_curve_intersect(KernelGlobals *kg, I
r_ext = mw_extension + r_curr;
#ifdef __KERNEL_SSE__
const float3 p_curr_sq = p_curr * p_curr;
- const float3 dxxx = _mm_sqrt_ss(_mm_hadd_ps(p_curr_sq.m128, p_curr_sq.m128));
+ const float3 dxxx(_mm_sqrt_ss(_mm_hadd_ps(p_curr_sq.m128, p_curr_sq.m128)));
float d = dxxx.x;
#else
float d = sqrtf(p_curr.x * p_curr.x + p_curr.y * p_curr.y);
diff --git a/intern/cycles/kernel/geom/geom_triangle.h b/intern/cycles/kernel/geom/geom_triangle.h
index 47778553b94..105aee8da15 100644
--- a/intern/cycles/kernel/geom/geom_triangle.h
+++ b/intern/cycles/kernel/geom/geom_triangle.h
@@ -76,7 +76,7 @@ ccl_device_inline void triangle_vertices(KernelGlobals *kg, int prim, float3 P[3
/* Interpolate smooth vertex normal from vertices */
-ccl_device_inline float3 triangle_smooth_normal(KernelGlobals *kg, int prim, float u, float v)
+ccl_device_inline float3 triangle_smooth_normal(KernelGlobals *kg, float3 Ng, int prim, float u, float v)
{
/* load triangle vertices */
const uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, prim);
@@ -84,7 +84,9 @@ ccl_device_inline float3 triangle_smooth_normal(KernelGlobals *kg, int prim, flo
float3 n1 = float4_to_float3(kernel_tex_fetch(__tri_vnormal, tri_vindex.y));
float3 n2 = float4_to_float3(kernel_tex_fetch(__tri_vnormal, tri_vindex.z));
- return normalize((1.0f - u - v)*n2 + u*n0 + v*n1);
+ float3 N = safe_normalize((1.0f - u - v)*n2 + u*n0 + v*n1);
+
+ return is_zero(N)? Ng: N;
}
/* Ray differentials on triangle */
diff --git a/intern/cycles/kernel/kernel.h b/intern/cycles/kernel/kernel.h
index 06c0fb2fbca..84a988f1dbc 100644
--- a/intern/cycles/kernel/kernel.h
+++ b/intern/cycles/kernel/kernel.h
@@ -50,30 +50,20 @@ void kernel_tex_copy(KernelGlobals *kg,
#define KERNEL_ARCH cpu
#include "kernel/kernels/cpu/kernel_cpu.h"
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
-# define KERNEL_ARCH cpu_sse2
-# include "kernel/kernels/cpu/kernel_cpu.h"
-#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 */
-
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
-# define KERNEL_ARCH cpu_sse3
-# include "kernel/kernels/cpu/kernel_cpu.h"
-#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 */
-
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
-# define KERNEL_ARCH cpu_sse41
-# include "kernel/kernels/cpu/kernel_cpu.h"
-#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 */
-
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
-# define KERNEL_ARCH cpu_avx
-# include "kernel/kernels/cpu/kernel_cpu.h"
-#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX */
-
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
-# define KERNEL_ARCH cpu_avx2
-# include "kernel/kernels/cpu/kernel_cpu.h"
-#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 */
+#define KERNEL_ARCH cpu_sse2
+#include "kernel/kernels/cpu/kernel_cpu.h"
+
+#define KERNEL_ARCH cpu_sse3
+#include "kernel/kernels/cpu/kernel_cpu.h"
+
+#define KERNEL_ARCH cpu_sse41
+#include "kernel/kernels/cpu/kernel_cpu.h"
+
+#define KERNEL_ARCH cpu_avx
+#include "kernel/kernels/cpu/kernel_cpu.h"
+
+#define KERNEL_ARCH cpu_avx2
+#include "kernel/kernels/cpu/kernel_cpu.h"
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernel_accumulate.h b/intern/cycles/kernel/kernel_accumulate.h
index 823d30dde78..9ed16aceb55 100644
--- a/intern/cycles/kernel/kernel_accumulate.h
+++ b/intern/cycles/kernel/kernel_accumulate.h
@@ -220,8 +220,16 @@ ccl_device_inline void path_radiance_init(PathRadiance *L, int use_light_pass)
#ifdef __SHADOW_TRICKS__
L->path_total = make_float3(0.0f, 0.0f, 0.0f);
L->path_total_shaded = make_float3(0.0f, 0.0f, 0.0f);
- L->shadow_color = make_float3(0.0f, 0.0f, 0.0f);
+ L->shadow_background_color = make_float3(0.0f, 0.0f, 0.0f);
+ L->shadow_radiance_sum = make_float3(0.0f, 0.0f, 0.0f);
+ L->shadow_throughput = 0.0f;
#endif
+
+#ifdef __DENOISING_FEATURES__
+ L->denoising_normal = make_float3(0.0f, 0.0f, 0.0f);
+ L->denoising_albedo = make_float3(0.0f, 0.0f, 0.0f);
+ L->denoising_depth = 0.0f;
+#endif /* __DENOISING_FEATURES__ */
}
ccl_device_inline void path_radiance_bsdf_bounce(PathRadiance *L, ccl_addr_space float3 *throughput,
@@ -277,15 +285,15 @@ ccl_device_inline void path_radiance_accum_emission(PathRadiance *L, float3 thro
}
ccl_device_inline void path_radiance_accum_ao(PathRadiance *L,
+ ccl_addr_space PathState *state,
float3 throughput,
float3 alpha,
float3 bsdf,
- float3 ao,
- int bounce)
+ float3 ao)
{
#ifdef __PASSES__
if(L->use_light_pass) {
- if(bounce == 0) {
+ if(state->bounce == 0) {
/* directly visible lighting */
L->direct_diffuse += throughput*bsdf*ao;
L->ao += alpha*throughput*ao;
@@ -302,31 +310,43 @@ ccl_device_inline void path_radiance_accum_ao(PathRadiance *L,
}
#ifdef __SHADOW_TRICKS__
- float3 light = throughput * bsdf;
- L->path_total += light;
- L->path_total_shaded += ao * light;
+ if(state->flag & PATH_RAY_STORE_SHADOW_INFO) {
+ float3 light = throughput * bsdf;
+ L->path_total += light;
+ L->path_total_shaded += ao * light;
+ }
#endif
}
ccl_device_inline void path_radiance_accum_total_ao(
PathRadiance *L,
+ ccl_addr_space PathState *state,
float3 throughput,
float3 bsdf)
{
#ifdef __SHADOW_TRICKS__
- L->path_total += throughput * bsdf;
+ if(state->flag & PATH_RAY_STORE_SHADOW_INFO) {
+ L->path_total += throughput * bsdf;
+ }
#else
(void) L;
+ (void) state;
(void) throughput;
(void) bsdf;
#endif
}
-ccl_device_inline void path_radiance_accum_light(PathRadiance *L, float3 throughput, BsdfEval *bsdf_eval, float3 shadow, float shadow_fac, int bounce, bool is_lamp)
+ccl_device_inline void path_radiance_accum_light(PathRadiance *L,
+ ccl_addr_space PathState *state,
+ float3 throughput,
+ BsdfEval *bsdf_eval,
+ float3 shadow,
+ float shadow_fac,
+ bool is_lamp)
{
#ifdef __PASSES__
if(L->use_light_pass) {
- if(bounce == 0) {
+ if(state->bounce == 0) {
/* directly visible lighting */
L->direct_diffuse += throughput*bsdf_eval->diffuse*shadow;
L->direct_glossy += throughput*bsdf_eval->glossy*shadow;
@@ -352,21 +372,27 @@ ccl_device_inline void path_radiance_accum_light(PathRadiance *L, float3 through
}
#ifdef __SHADOW_TRICKS__
- float3 light = throughput * bsdf_eval->sum_no_mis;
- L->path_total += light;
- L->path_total_shaded += shadow * light;
+ if(state->flag & PATH_RAY_STORE_SHADOW_INFO) {
+ float3 light = throughput * bsdf_eval->sum_no_mis;
+ L->path_total += light;
+ L->path_total_shaded += shadow * light;
+ }
#endif
}
ccl_device_inline void path_radiance_accum_total_light(
PathRadiance *L,
+ ccl_addr_space PathState *state,
float3 throughput,
const BsdfEval *bsdf_eval)
{
#ifdef __SHADOW_TRICKS__
- L->path_total += throughput * bsdf_eval->sum_no_mis;
+ if(state->flag & PATH_RAY_STORE_SHADOW_INFO) {
+ L->path_total += throughput * bsdf_eval->sum_no_mis;
+ }
#else
(void) L;
+ (void) state;
(void) throughput;
(void) bsdf_eval;
#endif
@@ -393,11 +419,17 @@ ccl_device_inline void path_radiance_accum_background(PathRadiance *L,
}
#ifdef __SHADOW_TRICKS__
- L->path_total += throughput * value;
- if(state->flag & PATH_RAY_SHADOW_CATCHER_ONLY) {
- L->path_total_shaded += throughput * value;
+ if(state->flag & PATH_RAY_STORE_SHADOW_INFO) {
+ L->path_total += throughput * value;
+ if(state->flag & PATH_RAY_SHADOW_CATCHER_ONLY) {
+ L->path_total_shaded += throughput * value;
+ }
}
#endif
+
+#ifdef __DENOISING_FEATURES__
+ L->denoising_albedo += state->denoising_feature_weight * value;
+#endif /* __DENOISING_FEATURES__ */
}
ccl_device_inline void path_radiance_sum_indirect(PathRadiance *L)
@@ -555,29 +587,79 @@ ccl_device_inline float3 path_radiance_clamp_and_sum(KernelGlobals *kg, PathRadi
return L_sum;
}
+ccl_device_inline void path_radiance_split_denoising(KernelGlobals *kg, PathRadiance *L, float3 *noisy, float3 *clean)
+{
+#ifdef __PASSES__
+ kernel_assert(L->use_light_pass);
+
+ *clean = L->emission + L->background;
+ *noisy = L->direct_scatter + L->indirect_scatter;
+
+# define ADD_COMPONENT(flag, component) \
+ if(kernel_data.film.denoising_flags & flag) \
+ *clean += component; \
+ else \
+ *noisy += component;
+
+ ADD_COMPONENT(DENOISING_CLEAN_DIFFUSE_DIR, L->direct_diffuse);
+ ADD_COMPONENT(DENOISING_CLEAN_DIFFUSE_IND, L->indirect_diffuse);
+ ADD_COMPONENT(DENOISING_CLEAN_GLOSSY_DIR, L->direct_glossy);
+ ADD_COMPONENT(DENOISING_CLEAN_GLOSSY_IND, L->indirect_glossy);
+ ADD_COMPONENT(DENOISING_CLEAN_TRANSMISSION_DIR, L->direct_transmission);
+ ADD_COMPONENT(DENOISING_CLEAN_TRANSMISSION_IND, L->indirect_transmission);
+ ADD_COMPONENT(DENOISING_CLEAN_SUBSURFACE_DIR, L->direct_subsurface);
+ ADD_COMPONENT(DENOISING_CLEAN_SUBSURFACE_IND, L->indirect_subsurface);
+# undef ADD_COMPONENT
+#else
+ *noisy = L->emission;
+ *clean = make_float3(0.0f, 0.0f, 0.0f);
+#endif
+
+ *noisy = ensure_finite3(*noisy);
+ *clean = ensure_finite3(*clean);
+}
+
ccl_device_inline void path_radiance_accum_sample(PathRadiance *L, PathRadiance *L_sample, int num_samples)
{
float fac = 1.0f/num_samples;
+#ifdef __SPLIT_KERNEL__
+# define safe_float3_add(f, v) \
+ do { \
+ ccl_global float *p = (ccl_global float*)(&(f)); \
+ atomic_add_and_fetch_float(p+0, (v).x); \
+ atomic_add_and_fetch_float(p+1, (v).y); \
+ atomic_add_and_fetch_float(p+2, (v).z); \
+ } while(0)
+#else
+# define safe_float3_add(f, v) (f) += (v)
+#endif /* __SPLIT_KERNEL__ */
+
#ifdef __PASSES__
- L->direct_diffuse += L_sample->direct_diffuse*fac;
- L->direct_glossy += L_sample->direct_glossy*fac;
- L->direct_transmission += L_sample->direct_transmission*fac;
- L->direct_subsurface += L_sample->direct_subsurface*fac;
- L->direct_scatter += L_sample->direct_scatter*fac;
-
- L->indirect_diffuse += L_sample->indirect_diffuse*fac;
- L->indirect_glossy += L_sample->indirect_glossy*fac;
- L->indirect_transmission += L_sample->indirect_transmission*fac;
- L->indirect_subsurface += L_sample->indirect_subsurface*fac;
- L->indirect_scatter += L_sample->indirect_scatter*fac;
-
- L->background += L_sample->background*fac;
- L->ao += L_sample->ao*fac;
- L->shadow += L_sample->shadow*fac;
+ safe_float3_add(L->direct_diffuse, L_sample->direct_diffuse*fac);
+ safe_float3_add(L->direct_glossy, L_sample->direct_glossy*fac);
+ safe_float3_add(L->direct_transmission, L_sample->direct_transmission*fac);
+ safe_float3_add(L->direct_subsurface, L_sample->direct_subsurface*fac);
+ safe_float3_add(L->direct_scatter, L_sample->direct_scatter*fac);
+
+ safe_float3_add(L->indirect_diffuse, L_sample->indirect_diffuse*fac);
+ safe_float3_add(L->indirect_glossy, L_sample->indirect_glossy*fac);
+ safe_float3_add(L->indirect_transmission, L_sample->indirect_transmission*fac);
+ safe_float3_add(L->indirect_subsurface, L_sample->indirect_subsurface*fac);
+ safe_float3_add(L->indirect_scatter, L_sample->indirect_scatter*fac);
+
+ safe_float3_add(L->background, L_sample->background*fac);
+ safe_float3_add(L->ao, L_sample->ao*fac);
+ safe_float3_add(L->shadow, L_sample->shadow*fac);
+# ifdef __SPLIT_KERNEL__
+ atomic_add_and_fetch_float(&L->mist, L_sample->mist*fac);
+# else
L->mist += L_sample->mist*fac;
-#endif
- L->emission += L_sample->emission * fac;
+# endif /* __SPLIT_KERNEL__ */
+#endif /* __PASSES__ */
+ safe_float3_add(L->emission, L_sample->emission*fac);
+
+#undef safe_float3_add
}
#ifdef __SHADOW_TRICKS__
@@ -595,16 +677,17 @@ ccl_device_inline float path_radiance_sum_shadow(const PathRadiance *L)
/* Calculate final light sum and transparency for shadow catcher object. */
ccl_device_inline float3 path_radiance_sum_shadowcatcher(KernelGlobals *kg,
const PathRadiance *L,
- ccl_addr_space float* L_transparent)
+ float* alpha)
{
const float shadow = path_radiance_sum_shadow(L);
float3 L_sum;
if(kernel_data.background.transparent) {
- *L_transparent = shadow;
- L_sum = make_float3(0.0f, 0.0f, 0.0f);
+ *alpha = 1.0f - L->shadow_throughput * shadow;
+ L_sum = L->shadow_radiance_sum;
}
else {
- L_sum = L->shadow_color * shadow;
+ L_sum = L->shadow_background_color * L->shadow_throughput * shadow +
+ L->shadow_radiance_sum;
}
return L_sum;
}
diff --git a/intern/cycles/kernel/kernel_compat_cpu.h b/intern/cycles/kernel/kernel_compat_cpu.h
index 21da180bb8e..93934ee6b38 100644
--- a/intern/cycles/kernel/kernel_compat_cpu.h
+++ b/intern/cycles/kernel/kernel_compat_cpu.h
@@ -195,7 +195,7 @@ template<typename T> struct texture_image {
if(x < 0.0f || y < 0.0f || x > 1.0f || y > 1.0f) {
return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
}
- /* Fall through. */
+ ATTR_FALLTHROUGH;
case EXTENSION_EXTEND:
ix = wrap_clamp(ix, width);
iy = wrap_clamp(iy, height);
@@ -222,7 +222,7 @@ template<typename T> struct texture_image {
if(x < 0.0f || y < 0.0f || x > 1.0f || y > 1.0f) {
return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
}
- /* Fall through. */
+ ATTR_FALLTHROUGH;
case EXTENSION_EXTEND:
nix = wrap_clamp(ix+1, width);
niy = wrap_clamp(iy+1, height);
@@ -265,7 +265,7 @@ template<typename T> struct texture_image {
if(x < 0.0f || y < 0.0f || x > 1.0f || y > 1.0f) {
return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
}
- /* Fall through. */
+ ATTR_FALLTHROUGH;
case EXTENSION_EXTEND:
pix = wrap_clamp(ix-1, width);
piy = wrap_clamp(iy-1, height);
@@ -335,7 +335,7 @@ template<typename T> struct texture_image {
{
return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
}
- /* Fall through. */
+ ATTR_FALLTHROUGH;
case EXTENSION_EXTEND:
ix = wrap_clamp(ix, width);
iy = wrap_clamp(iy, height);
@@ -374,7 +374,7 @@ template<typename T> struct texture_image {
{
return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
}
- /* Fall through. */
+ ATTR_FALLTHROUGH;
case EXTENSION_EXTEND:
nix = wrap_clamp(ix+1, width);
niy = wrap_clamp(iy+1, height);
@@ -449,7 +449,7 @@ template<typename T> struct texture_image {
{
return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
}
- /* Fall through. */
+ ATTR_FALLTHROUGH;
case EXTENSION_EXTEND:
pix = wrap_clamp(ix-1, width);
piy = wrap_clamp(iy-1, height);
diff --git a/intern/cycles/kernel/kernel_compat_cuda.h b/intern/cycles/kernel/kernel_compat_cuda.h
index c375d17a95f..38708f7ff0b 100644
--- a/intern/cycles/kernel/kernel_compat_cuda.h
+++ b/intern/cycles/kernel/kernel_compat_cuda.h
@@ -55,6 +55,11 @@
#define ccl_restrict __restrict__
#define ccl_align(n) __align__(n)
+#define ATTR_FALLTHROUGH
+
+#define CCL_MAX_LOCAL_SIZE (CUDA_THREADS_BLOCK_WIDTH*CUDA_THREADS_BLOCK_WIDTH)
+
+
/* No assert supported for CUDA */
#define kernel_assert(cond)
diff --git a/intern/cycles/kernel/kernel_compat_opencl.h b/intern/cycles/kernel/kernel_compat_opencl.h
index c2263ac0d49..4836c290312 100644
--- a/intern/cycles/kernel/kernel_compat_opencl.h
+++ b/intern/cycles/kernel/kernel_compat_opencl.h
@@ -50,6 +50,8 @@
# define ccl_addr_space
#endif
+#define ATTR_FALLTHROUGH
+
#define ccl_local_id(d) get_local_id(d)
#define ccl_global_id(d) get_global_id(d)
diff --git a/intern/cycles/kernel/kernel_globals.h b/intern/cycles/kernel/kernel_globals.h
index c9c97ea977e..f95f0d98c52 100644
--- a/intern/cycles/kernel/kernel_globals.h
+++ b/intern/cycles/kernel/kernel_globals.h
@@ -19,6 +19,10 @@
#ifndef __KERNEL_GLOBALS_H__
#define __KERNEL_GLOBALS_H__
+#ifdef __KERNEL_CPU__
+# include "util/util_vector.h"
+#endif
+
CCL_NAMESPACE_BEGIN
/* On the CPU, we pass along the struct KernelGlobals to nearly everywhere in
@@ -38,12 +42,12 @@ struct Intersection;
struct VolumeStep;
typedef struct KernelGlobals {
- texture_image_uchar4 texture_byte4_images[TEX_NUM_BYTE4_CPU];
- texture_image_float4 texture_float4_images[TEX_NUM_FLOAT4_CPU];
- texture_image_half4 texture_half4_images[TEX_NUM_HALF4_CPU];
- texture_image_float texture_float_images[TEX_NUM_FLOAT_CPU];
- texture_image_uchar texture_byte_images[TEX_NUM_BYTE_CPU];
- texture_image_half texture_half_images[TEX_NUM_HALF_CPU];
+ vector<texture_image_float4> texture_float4_images;
+ vector<texture_image_uchar4> texture_byte4_images;
+ vector<texture_image_half4> texture_half4_images;
+ vector<texture_image_float> texture_float_images;
+ vector<texture_image_uchar> texture_byte_images;
+ vector<texture_image_half> texture_half_images;
# define KERNEL_TEX(type, ttype, name) ttype name;
# define KERNEL_IMAGE_TEX(type, ttype, name)
diff --git a/intern/cycles/kernel/kernel_image_opencl.h b/intern/cycles/kernel/kernel_image_opencl.h
index 0352c58037d..90747e09357 100644
--- a/intern/cycles/kernel/kernel_image_opencl.h
+++ b/intern/cycles/kernel/kernel_image_opencl.h
@@ -20,18 +20,19 @@
ccl_device_inline float4 svm_image_texture_read(KernelGlobals *kg, int id, int offset)
{
+ const int texture_type = kernel_tex_type(id);
/* Float4 */
- if(id < TEX_START_BYTE4_OPENCL) {
+ if(texture_type == IMAGE_DATA_TYPE_FLOAT4) {
return kernel_tex_fetch(__tex_image_float4_packed, offset);
}
/* Byte4 */
- else if(id < TEX_START_FLOAT_OPENCL) {
+ else if(texture_type == IMAGE_DATA_TYPE_BYTE4) {
uchar4 r = kernel_tex_fetch(__tex_image_byte4_packed, offset);
float f = 1.0f/255.0f;
return make_float4(r.x*f, r.y*f, r.z*f, r.w*f);
}
/* Float */
- else if(id < TEX_START_BYTE_OPENCL) {
+ else if(texture_type == IMAGE_DATA_TYPE_FLOAT) {
float f = kernel_tex_fetch(__tex_image_float_packed, offset);
return make_float4(f, f, f, 1.0f);
}
@@ -63,23 +64,34 @@ ccl_device_inline float svm_image_texture_frac(float x, int *ix)
return x - (float)i;
}
+ccl_device_inline uint kernel_decode_image_interpolation(uint4 info)
+{
+ return (info.w & (1 << 0)) ? INTERPOLATION_CLOSEST : INTERPOLATION_LINEAR;
+}
+
+ccl_device_inline uint kernel_decode_image_extension(uint4 info)
+{
+ if(info.w & (1 << 1)) {
+ return EXTENSION_REPEAT;
+ }
+ else if(info.w & (1 << 2)) {
+ return EXTENSION_EXTEND;
+ }
+ else {
+ return EXTENSION_CLIP;
+ }
+}
+
ccl_device float4 kernel_tex_image_interp(KernelGlobals *kg, int id, float x, float y)
{
uint4 info = kernel_tex_fetch(__tex_image_packed_info, id*2);
uint width = info.x;
uint height = info.y;
uint offset = info.z;
-
- /* Image Options */
- uint interpolation = (info.w & (1 << 0)) ? INTERPOLATION_CLOSEST : INTERPOLATION_LINEAR;
- uint extension;
- if(info.w & (1 << 1))
- extension = EXTENSION_REPEAT;
- else if(info.w & (1 << 2))
- extension = EXTENSION_EXTEND;
- else
- extension = EXTENSION_CLIP;
-
+ /* Decode image options. */
+ uint interpolation = kernel_decode_image_interpolation(info);
+ uint extension = kernel_decode_image_extension(info);
+ /* Actual sampling. */
float4 r;
int ix, iy, nix, niy;
if(interpolation == INTERPOLATION_CLOSEST) {
@@ -132,7 +144,6 @@ ccl_device float4 kernel_tex_image_interp(KernelGlobals *kg, int id, float x, fl
r += ty*(1.0f - tx)*svm_image_texture_read(kg, id, offset + ix + niy*width);
r += ty*tx*svm_image_texture_read(kg, id, offset + nix + niy*width);
}
-
return r;
}
@@ -144,17 +155,10 @@ ccl_device float4 kernel_tex_image_interp_3d(KernelGlobals *kg, int id, float x,
uint height = info.y;
uint offset = info.z;
uint depth = kernel_tex_fetch(__tex_image_packed_info, id*2+1).x;
-
- /* Image Options */
- uint interpolation = (info.w & (1 << 0)) ? INTERPOLATION_CLOSEST : INTERPOLATION_LINEAR;
- uint extension;
- if(info.w & (1 << 1))
- extension = EXTENSION_REPEAT;
- else if(info.w & (1 << 2))
- extension = EXTENSION_EXTEND;
- else
- extension = EXTENSION_CLIP;
-
+ /* Decode image options. */
+ uint interpolation = kernel_decode_image_interpolation(info);
+ uint extension = kernel_decode_image_extension(info);
+ /* Actual sampling. */
float4 r;
int ix, iy, iz, nix, niy, niz;
if(interpolation == INTERPOLATION_CLOSEST) {
@@ -171,7 +175,7 @@ ccl_device float4 kernel_tex_image_interp_3d(KernelGlobals *kg, int id, float x,
if(extension == EXTENSION_CLIP) {
if(x < 0.0f || y < 0.0f || z < 0.0f ||
x > 1.0f || y > 1.0f || z > 1.0f)
- {
+ {
return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
}
}
@@ -198,12 +202,13 @@ ccl_device float4 kernel_tex_image_interp_3d(KernelGlobals *kg, int id, float x,
niz = svm_image_texture_wrap_periodic(iz+1, depth);
}
else {
- if(extension == EXTENSION_CLIP)
+ if(extension == EXTENSION_CLIP) {
if(x < 0.0f || y < 0.0f || z < 0.0f ||
x > 1.0f || y > 1.0f || z > 1.0f)
{
return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
}
+ }
/* Fall through. */
/* EXTENSION_EXTEND */
nix = svm_image_texture_wrap_clamp(ix+1, width);
@@ -224,8 +229,6 @@ ccl_device float4 kernel_tex_image_interp_3d(KernelGlobals *kg, int id, float x,
r += tz*(1.0f - ty)*tx*svm_image_texture_read(kg, id, offset + nix + iy*width + niz*width*height);
r += tz*ty*(1.0f - tx)*svm_image_texture_read(kg, id, offset + ix + niy*width + niz*width*height);
r += tz*ty*tx*svm_image_texture_read(kg, id, offset + nix + niy*width + niz*width*height);
-
}
-
return r;
}
diff --git a/intern/cycles/kernel/kernel_jitter.h b/intern/cycles/kernel/kernel_jitter.h
index 67546131746..f5855757d3f 100644
--- a/intern/cycles/kernel/kernel_jitter.h
+++ b/intern/cycles/kernel/kernel_jitter.h
@@ -175,15 +175,26 @@ ccl_device float cmj_sample_1D(int s, int N, int p)
return (x + jx)*invN;
}
-ccl_device void cmj_sample_2D(int s, int N, int p, float *fx, float *fy)
+/* TODO(sergey): Do some extra tests and consider moving to util_math.h. */
+ccl_device_inline int cmj_isqrt(int value)
{
- kernel_assert(s < N);
-
#if defined(__KERNEL_CUDA__)
- int m = float_to_int(__fsqrt_ru(N));
+ return float_to_int(__fsqrt_ru(value));
+#elif defined(__KERNEL_GPU__)
+ return float_to_int(sqrtf(value));
#else
- int m = float_to_int(sqrtf(N));
+ /* This is a work around for fast-math on CPU which might replace sqrtf()
+ * with am approximated version.
+ */
+ return float_to_int(sqrtf(value) + 1e-6f);
#endif
+}
+
+ccl_device void cmj_sample_2D(int s, int N, int p, float *fx, float *fy)
+{
+ kernel_assert(s < N);
+
+ int m = cmj_isqrt(N);
int n = (N - 1)/m + 1;
float invN = 1.0f/N;
float invm = 1.0f/m;
diff --git a/intern/cycles/kernel/kernel_light.h b/intern/cycles/kernel/kernel_light.h
index a2909cec1a1..9baa9d54957 100644
--- a/intern/cycles/kernel/kernel_light.h
+++ b/intern/cycles/kernel/kernel_light.h
@@ -102,7 +102,7 @@ ccl_device_inline float area_light_sample(float3 P,
float cu = 1.0f / sqrtf(fu * fu + b0sq) * (fu > 0.0f ? 1.0f : -1.0f);
cu = clamp(cu, -1.0f, 1.0f);
/* Compute xu. */
- float xu = -(cu * z0) / sqrtf(1.0f - cu * cu);
+ float xu = -(cu * z0) / max(sqrtf(1.0f - cu * cu), 1e-7f);
xu = clamp(xu, x0, x1);
/* Compute yv. */
float z0sq = z0 * z0;
diff --git a/intern/cycles/kernel/kernel_passes.h b/intern/cycles/kernel/kernel_passes.h
index ed523696571..9cd7ffb181d 100644
--- a/intern/cycles/kernel/kernel_passes.h
+++ b/intern/cycles/kernel/kernel_passes.h
@@ -60,6 +60,140 @@ ccl_device_inline void kernel_write_pass_float4(ccl_global float *buffer, int sa
#endif /* __SPLIT_KERNEL__ */
}
+#ifdef __DENOISING_FEATURES__
+ccl_device_inline void kernel_write_pass_float_variance(ccl_global float *buffer, int sample, float value)
+{
+ kernel_write_pass_float(buffer, sample, value);
+
+ /* The online one-pass variance update that's used for the megakernel can't easily be implemented
+ * with atomics, so for the split kernel the E[x^2] - 1/N * (E[x])^2 fallback is used. */
+# ifdef __SPLIT_KERNEL__
+ kernel_write_pass_float(buffer+1, sample, value*value);
+# else
+ if(sample == 0) {
+ kernel_write_pass_float(buffer+1, sample, 0.0f);
+ }
+ else {
+ float new_mean = buffer[0] * (1.0f / (sample + 1));
+ float old_mean = (buffer[0] - value) * (1.0f / sample);
+ kernel_write_pass_float(buffer+1, sample, (value - new_mean) * (value - old_mean));
+ }
+# endif
+}
+
+# if defined(__SPLIT_KERNEL__)
+# define kernel_write_pass_float3_unaligned kernel_write_pass_float3
+# else
+ccl_device_inline void kernel_write_pass_float3_unaligned(ccl_global float *buffer, int sample, float3 value)
+{
+ buffer[0] = (sample == 0)? value.x: buffer[0] + value.x;
+ buffer[1] = (sample == 0)? value.y: buffer[1] + value.y;
+ buffer[2] = (sample == 0)? value.z: buffer[2] + value.z;
+}
+# endif
+
+ccl_device_inline void kernel_write_pass_float3_variance(ccl_global float *buffer, int sample, float3 value)
+{
+ kernel_write_pass_float3_unaligned(buffer, sample, value);
+# ifdef __SPLIT_KERNEL__
+ kernel_write_pass_float3_unaligned(buffer+3, sample, value*value);
+# else
+ if(sample == 0) {
+ kernel_write_pass_float3_unaligned(buffer+3, sample, make_float3(0.0f, 0.0f, 0.0f));
+ }
+ else {
+ float3 sum = make_float3(buffer[0], buffer[1], buffer[2]);
+ float3 new_mean = sum * (1.0f / (sample + 1));
+ float3 old_mean = (sum - value) * (1.0f / sample);
+ kernel_write_pass_float3_unaligned(buffer+3, sample, (value - new_mean) * (value - old_mean));
+ }
+# endif
+}
+
+ccl_device_inline void kernel_write_denoising_shadow(KernelGlobals *kg, ccl_global float *buffer,
+ int sample, float path_total, float path_total_shaded)
+{
+ if(kernel_data.film.pass_denoising_data == 0)
+ return;
+
+ buffer += (sample & 1)? DENOISING_PASS_SHADOW_B : DENOISING_PASS_SHADOW_A;
+
+ path_total = ensure_finite(path_total);
+ path_total_shaded = ensure_finite(path_total_shaded);
+
+ kernel_write_pass_float(buffer, sample/2, path_total);
+ kernel_write_pass_float(buffer+1, sample/2, path_total_shaded);
+
+ float value = path_total_shaded / max(path_total, 1e-7f);
+# ifdef __SPLIT_KERNEL__
+ kernel_write_pass_float(buffer+2, sample/2, value*value);
+# else
+ if(sample < 2) {
+ kernel_write_pass_float(buffer+2, sample/2, 0.0f);
+ }
+ else {
+ float old_value = (buffer[1] - path_total_shaded) / max(buffer[0] - path_total, 1e-7f);
+ float new_value = buffer[1] / max(buffer[0], 1e-7f);
+ kernel_write_pass_float(buffer+2, sample, (value - new_value) * (value - old_value));
+ }
+# endif
+}
+#endif /* __DENOISING_FEATURES__ */
+
+ccl_device_inline void kernel_update_denoising_features(KernelGlobals *kg,
+ ShaderData *sd,
+ ccl_addr_space PathState *state,
+ PathRadiance *L)
+{
+#ifdef __DENOISING_FEATURES__
+ if(state->denoising_feature_weight == 0.0f) {
+ return;
+ }
+
+ L->denoising_depth += ensure_finite(state->denoising_feature_weight * sd->ray_length);
+
+ /* Skip implicitly transparent surfaces. */
+ if(sd->flag & SD_HAS_ONLY_VOLUME) {
+ return;
+ }
+
+ float3 normal = make_float3(0.0f, 0.0f, 0.0f);
+ float3 albedo = make_float3(0.0f, 0.0f, 0.0f);
+ float sum_weight = 0.0f, sum_nonspecular_weight = 0.0f;
+
+ for(int i = 0; i < sd->num_closure; i++) {
+ ShaderClosure *sc = &sd->closure[i];
+
+ if(!CLOSURE_IS_BSDF_OR_BSSRDF(sc->type))
+ continue;
+
+ /* All closures contribute to the normal feature, but only diffuse-like ones to the albedo. */
+ normal += sc->N * sc->sample_weight;
+ sum_weight += sc->sample_weight;
+ if(!bsdf_is_specular_like(sc)) {
+ albedo += sc->weight;
+ sum_nonspecular_weight += sc->sample_weight;
+ }
+ }
+
+ /* Wait for next bounce if 75% or more sample weight belongs to specular-like closures. */
+ if((sum_weight == 0.0f) || (sum_nonspecular_weight*4.0f > sum_weight)) {
+ if(sum_weight != 0.0f) {
+ normal /= sum_weight;
+ }
+ L->denoising_normal += ensure_finite3(state->denoising_feature_weight * normal);
+ L->denoising_albedo += ensure_finite3(state->denoising_feature_weight * albedo);
+
+ state->denoising_feature_weight = 0.0f;
+ }
+#else
+ (void) kg;
+ (void) sd;
+ (void) state;
+ (void) L;
+#endif /* __DENOISING_FEATURES__ */
+}
+
ccl_device_inline void kernel_write_data_passes(KernelGlobals *kg, ccl_global float *buffer, PathRadiance *L,
ShaderData *sd, int sample, ccl_addr_space PathState *state, float3 throughput)
{
@@ -199,5 +333,88 @@ ccl_device_inline void kernel_write_light_passes(KernelGlobals *kg, ccl_global f
#endif
}
+ccl_device_inline void kernel_write_result(KernelGlobals *kg, ccl_global float *buffer,
+ int sample, PathRadiance *L, float alpha, bool is_shadow_catcher)
+{
+ if(L) {
+ float3 L_sum;
+#ifdef __SHADOW_TRICKS__
+ if(is_shadow_catcher) {
+ L_sum = path_radiance_sum_shadowcatcher(kg, L, &alpha);
+ }
+ else
+#endif /* __SHADOW_TRICKS__ */
+ {
+ L_sum = path_radiance_clamp_and_sum(kg, L);
+ }
+
+ kernel_write_pass_float4(buffer, sample, make_float4(L_sum.x, L_sum.y, L_sum.z, alpha));
+
+ kernel_write_light_passes(kg, buffer, L, sample);
+
+#ifdef __DENOISING_FEATURES__
+ if(kernel_data.film.pass_denoising_data) {
+# ifdef __SHADOW_TRICKS__
+ kernel_write_denoising_shadow(kg, buffer + kernel_data.film.pass_denoising_data, sample, average(L->path_total), average(L->path_total_shaded));
+# else
+ kernel_write_denoising_shadow(kg, buffer + kernel_data.film.pass_denoising_data, sample, 0.0f, 0.0f);
+# endif
+ if(kernel_data.film.pass_denoising_clean) {
+ float3 noisy, clean;
+#ifdef __SHADOW_TRICKS__
+ if(is_shadow_catcher) {
+ noisy = L_sum;
+ clean = make_float3(0.0f, 0.0f, 0.0f);
+ }
+ else
+#endif /* __SHADOW_TRICKS__ */
+ {
+ path_radiance_split_denoising(kg, L, &noisy, &clean);
+ }
+ kernel_write_pass_float3_variance(buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_COLOR,
+ sample, noisy);
+ kernel_write_pass_float3_unaligned(buffer + kernel_data.film.pass_denoising_clean,
+ sample, clean);
+ }
+ else {
+ kernel_write_pass_float3_variance(buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_COLOR,
+ sample, ensure_finite3(L_sum));
+ }
+
+ kernel_write_pass_float3_variance(buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_NORMAL,
+ sample, L->denoising_normal);
+ kernel_write_pass_float3_variance(buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_ALBEDO,
+ sample, L->denoising_albedo);
+ kernel_write_pass_float_variance(buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_DEPTH,
+ sample, L->denoising_depth);
+ }
+#endif /* __DENOISING_FEATURES__ */
+ }
+ else {
+ kernel_write_pass_float4(buffer, sample, make_float4(0.0f, 0.0f, 0.0f, 0.0f));
+
+#ifdef __DENOISING_FEATURES__
+ if(kernel_data.film.pass_denoising_data) {
+ kernel_write_denoising_shadow(kg, buffer + kernel_data.film.pass_denoising_data, sample, 0.0f, 0.0f);
+
+ kernel_write_pass_float3_variance(buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_COLOR,
+ sample, make_float3(0.0f, 0.0f, 0.0f));
+
+ kernel_write_pass_float3_variance(buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_NORMAL,
+ sample, make_float3(0.0f, 0.0f, 0.0f));
+ kernel_write_pass_float3_variance(buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_ALBEDO,
+ sample, make_float3(0.0f, 0.0f, 0.0f));
+ kernel_write_pass_float_variance(buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_DEPTH,
+ sample, 0.0f);
+
+ if(kernel_data.film.pass_denoising_clean) {
+ kernel_write_pass_float3_unaligned(buffer + kernel_data.film.pass_denoising_clean,
+ sample, make_float3(0.0f, 0.0f, 0.0f));
+ }
+ }
+#endif /* __DENOISING_FEATURES__ */
+ }
+}
+
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernel_path.h b/intern/cycles/kernel/kernel_path.h
index e7957042182..c340b3bc968 100644
--- a/intern/cycles/kernel/kernel_path.h
+++ b/intern/cycles/kernel/kernel_path.h
@@ -58,7 +58,7 @@ ccl_device_noinline void kernel_path_ao(KernelGlobals *kg,
ShaderData *sd,
ShaderData *emission_sd,
PathRadiance *L,
- PathState *state,
+ ccl_addr_space PathState *state,
RNG *rng,
float3 throughput,
float3 ao_alpha)
@@ -90,14 +90,16 @@ ccl_device_noinline void kernel_path_ao(KernelGlobals *kg,
light_ray.dD = differential3_zero();
if(!shadow_blocked(kg, emission_sd, state, &light_ray, &ao_shadow)) {
- path_radiance_accum_ao(L, throughput, ao_alpha, ao_bsdf, ao_shadow, state->bounce);
+ path_radiance_accum_ao(L, state, throughput, ao_alpha, ao_bsdf, ao_shadow);
}
else {
- path_radiance_accum_total_ao(L, throughput, ao_bsdf);
+ path_radiance_accum_total_ao(L, state, throughput, ao_bsdf);
}
}
}
+#ifndef __SPLIT_KERNEL__
+
ccl_device void kernel_path_indirect(KernelGlobals *kg,
ShaderData *sd,
ShaderData *emission_sd,
@@ -364,6 +366,8 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg,
throughput /= probability;
}
+ kernel_update_denoising_features(kg, sd, state, L);
+
#ifdef __AO__
/* ambient occlusion */
if(kernel_data.integrator.use_ambient_occlusion || (sd->flag & SD_AO)) {
@@ -403,7 +407,7 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg,
}
#endif /* __SUBSURFACE__ */
-#if defined(__EMISSION__) && defined(__BRANCHED_PATH__)
+#if defined(__EMISSION__)
if(kernel_data.integrator.use_direct_light) {
int all = (kernel_data.integrator.sample_all_lights_indirect) ||
(state->flag & PATH_RAY_SHADOW_CATCHER);
@@ -417,7 +421,7 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg,
L,
all);
}
-#endif /* defined(__EMISSION__) && defined(__BRANCHED_PATH__) */
+#endif /* defined(__EMISSION__) */
if(!kernel_path_surface_bounce(kg, rng, sd, &throughput, state, L, ray))
break;
@@ -425,18 +429,19 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg,
}
-ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
- RNG *rng,
- int sample,
- Ray ray,
- ccl_global float *buffer)
+ccl_device_inline float kernel_path_integrate(KernelGlobals *kg,
+ RNG *rng,
+ int sample,
+ Ray ray,
+ ccl_global float *buffer,
+ PathRadiance *L,
+ bool *is_shadow_catcher)
{
/* initialize */
- PathRadiance L;
float3 throughput = make_float3(1.0f, 1.0f, 1.0f);
float L_transparent = 0.0f;
- path_radiance_init(&L, kernel_data.film.use_light_pass);
+ path_radiance_init(L, kernel_data.film.use_light_pass);
/* shader data memory used for both volumes and surfaces, saves stack space */
ShaderData sd;
@@ -515,7 +520,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
float3 emission;
if(indirect_lamp_emission(kg, &emission_sd, &state, &light_ray, &emission))
- path_radiance_accum_emission(&L, throughput, emission, state.bounce);
+ path_radiance_accum_emission(L, throughput, emission, state.bounce);
}
#endif /* __LAMP_MIS__ */
@@ -547,7 +552,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
/* emission */
if(volume_segment.closure_flag & SD_EMISSION)
- path_radiance_accum_emission(&L, throughput, volume_segment.accum_emission, state.bounce);
+ path_radiance_accum_emission(L, throughput, volume_segment.accum_emission, state.bounce);
/* scattering */
VolumeIntegrateResult result = VOLUME_PATH_ATTENUATED;
@@ -557,7 +562,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
/* direct light sampling */
kernel_branched_path_volume_connect_light(kg, rng, &sd,
- &emission_sd, throughput, &state, &L, all,
+ &emission_sd, throughput, &state, L, all,
&volume_ray, &volume_segment);
/* indirect sample. if we use distance sampling and take just
@@ -575,7 +580,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
kernel_volume_decoupled_free(kg, &volume_segment);
if(result == VOLUME_PATH_SCATTERED) {
- if(kernel_path_volume_bounce(kg, rng, &sd, &throughput, &state, &L, &ray))
+ if(kernel_path_volume_bounce(kg, rng, &sd, &throughput, &state, L, &ray))
continue;
else
break;
@@ -589,15 +594,15 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
{
/* integrate along volume segment with distance sampling */
VolumeIntegrateResult result = kernel_volume_integrate(
- kg, &state, &sd, &volume_ray, &L, &throughput, rng, heterogeneous);
+ kg, &state, &sd, &volume_ray, L, &throughput, rng, heterogeneous);
# ifdef __VOLUME_SCATTER__
if(result == VOLUME_PATH_SCATTERED) {
/* direct lighting */
- kernel_path_volume_connect_light(kg, rng, &sd, &emission_sd, throughput, &state, &L);
+ kernel_path_volume_connect_light(kg, rng, &sd, &emission_sd, throughput, &state, L);
/* indirect light bounce */
- if(kernel_path_volume_bounce(kg, rng, &sd, &throughput, &state, &L, &ray))
+ if(kernel_path_volume_bounce(kg, rng, &sd, &throughput, &state, L, &ray))
continue;
else
break;
@@ -621,7 +626,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
#ifdef __BACKGROUND__
/* sample background shader */
float3 L_background = indirect_background(kg, &emission_sd, &state, &ray);
- path_radiance_accum_background(&L, &state, throughput, L_background);
+ path_radiance_accum_background(L, &state, throughput, L_background);
#endif /* __BACKGROUND__ */
break;
@@ -638,11 +643,16 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
#ifdef __SHADOW_TRICKS__
if((sd.object_flag & SD_OBJECT_SHADOW_CATCHER)) {
if(state.flag & PATH_RAY_CAMERA) {
- state.flag |= (PATH_RAY_SHADOW_CATCHER | PATH_RAY_SHADOW_CATCHER_ONLY);
+ state.flag |= (PATH_RAY_SHADOW_CATCHER |
+ PATH_RAY_SHADOW_CATCHER_ONLY |
+ PATH_RAY_STORE_SHADOW_INFO);
state.catcher_object = sd.object;
if(!kernel_data.background.transparent) {
- L.shadow_color = indirect_background(kg, &emission_sd, &state, &ray);
+ L->shadow_background_color =
+ indirect_background(kg, &emission_sd, &state, &ray);
}
+ L->shadow_radiance_sum = path_radiance_clamp_and_sum(kg, L);
+ L->shadow_throughput = average(throughput);
}
}
else {
@@ -675,7 +685,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
#endif /* __HOLDOUT__ */
/* holdout mask objects do not write data passes */
- kernel_write_data_passes(kg, buffer, &L, &sd, sample, &state, throughput);
+ kernel_write_data_passes(kg, buffer, L, &sd, sample, &state, throughput);
/* blurring of bsdf after bounces, for rays that have a small likelihood
* of following this particular path (diffuse, rough glossy) */
@@ -693,7 +703,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
if(sd.flag & SD_EMISSION) {
/* todo: is isect.t wrong here for transparent surfaces? */
float3 emission = indirect_primitive_emission(kg, &sd, isect.t, state.flag, state.ray_pdf);
- path_radiance_accum_emission(&L, throughput, emission, state.bounce);
+ path_radiance_accum_emission(L, throughput, emission, state.bounce);
}
#endif /* __EMISSION__ */
@@ -713,10 +723,12 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
throughput /= probability;
}
+ kernel_update_denoising_features(kg, &sd, &state, L);
+
#ifdef __AO__
/* ambient occlusion */
if(kernel_data.integrator.use_ambient_occlusion || (sd.flag & SD_AO)) {
- kernel_path_ao(kg, &sd, &emission_sd, &L, &state, rng, throughput, shader_bsdf_alpha(kg, &sd));
+ kernel_path_ao(kg, &sd, &emission_sd, L, &state, rng, throughput, shader_bsdf_alpha(kg, &sd));
}
#endif /* __AO__ */
@@ -727,7 +739,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
if(kernel_path_subsurface_scatter(kg,
&sd,
&emission_sd,
- &L,
+ L,
&state,
rng,
&ray,
@@ -740,15 +752,15 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
#endif /* __SUBSURFACE__ */
/* direct lighting */
- kernel_path_surface_connect_light(kg, rng, &sd, &emission_sd, throughput, &state, &L);
+ kernel_path_surface_connect_light(kg, rng, &sd, &emission_sd, throughput, &state, L);
/* compute direct lighting and next bounce */
- if(!kernel_path_surface_bounce(kg, rng, &sd, &throughput, &state, &L, &ray))
+ if(!kernel_path_surface_bounce(kg, rng, &sd, &throughput, &state, L, &ray))
break;
}
#ifdef __SUBSURFACE__
- kernel_path_subsurface_accum_indirect(&ss_indirect, &L);
+ kernel_path_subsurface_accum_indirect(&ss_indirect, L);
/* Trace indirect subsurface rays by restarting the loop. this uses less
* stack memory than invoking kernel_path_indirect.
@@ -758,7 +770,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
&ss_indirect,
&state,
&ray,
- &L,
+ L,
&throughput);
}
else {
@@ -767,24 +779,15 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
}
#endif /* __SUBSURFACE__ */
- float3 L_sum;
#ifdef __SHADOW_TRICKS__
- if(state.flag & PATH_RAY_SHADOW_CATCHER) {
- L_sum = path_radiance_sum_shadowcatcher(kg, &L, &L_transparent);
- }
- else
+ *is_shadow_catcher = (state.flag & PATH_RAY_SHADOW_CATCHER);
#endif /* __SHADOW_TRICKS__ */
- {
- L_sum = path_radiance_clamp_and_sum(kg, &L);
- }
-
- kernel_write_light_passes(kg, buffer, &L, sample);
#ifdef __KERNEL_DEBUG__
kernel_write_debug_passes(kg, buffer, &state, &debug_data, sample);
#endif /* __KERNEL_DEBUG__ */
- return make_float4(L_sum.x, L_sum.y, L_sum.z, 1.0f - L_transparent);
+ return 1.0f - L_transparent;
}
ccl_device void kernel_path_trace(KernelGlobals *kg,
@@ -805,18 +808,21 @@ ccl_device void kernel_path_trace(KernelGlobals *kg,
kernel_path_trace_setup(kg, rng_state, sample, x, y, &rng, &ray);
/* integrate */
- float4 L;
-
- if(ray.t != 0.0f)
- L = kernel_path_integrate(kg, &rng, sample, ray, buffer);
- else
- L = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+ PathRadiance L;
+ bool is_shadow_catcher;
- /* accumulate result in output buffer */
- kernel_write_pass_float4(buffer, sample, L);
+ if(ray.t != 0.0f) {
+ float alpha = kernel_path_integrate(kg, &rng, sample, ray, buffer, &L, &is_shadow_catcher);
+ kernel_write_result(kg, buffer, sample, &L, alpha, is_shadow_catcher);
+ }
+ else {
+ kernel_write_result(kg, buffer, sample, NULL, 0.0f, false);
+ }
path_rng_end(kg, rng_state, rng);
}
+#endif /* __SPLIT_KERNEL__ */
+
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernel_path_branched.h b/intern/cycles/kernel/kernel_path_branched.h
index 36fd6c95fe7..77d4f1df447 100644
--- a/intern/cycles/kernel/kernel_path_branched.h
+++ b/intern/cycles/kernel/kernel_path_branched.h
@@ -22,7 +22,7 @@ ccl_device_inline void kernel_branched_path_ao(KernelGlobals *kg,
ShaderData *sd,
ShaderData *emission_sd,
PathRadiance *L,
- PathState *state,
+ ccl_addr_space PathState *state,
RNG *rng,
float3 throughput)
{
@@ -56,29 +56,48 @@ ccl_device_inline void kernel_branched_path_ao(KernelGlobals *kg,
light_ray.dD = differential3_zero();
if(!shadow_blocked(kg, emission_sd, state, &light_ray, &ao_shadow)) {
- path_radiance_accum_ao(L, throughput*num_samples_inv, ao_alpha, ao_bsdf, ao_shadow, state->bounce);
+ path_radiance_accum_ao(L, state, throughput*num_samples_inv, ao_alpha, ao_bsdf, ao_shadow);
}
else {
- path_radiance_accum_total_ao(L, throughput*num_samples_inv, ao_bsdf);
+ path_radiance_accum_total_ao(L, state, throughput*num_samples_inv, ao_bsdf);
}
}
}
}
+#ifndef __SPLIT_KERNEL__
/* bounce off surface and integrate indirect light */
ccl_device_noinline void kernel_branched_path_surface_indirect_light(KernelGlobals *kg,
RNG *rng, ShaderData *sd, ShaderData *indirect_sd, ShaderData *emission_sd,
float3 throughput, float num_samples_adjust, PathState *state, PathRadiance *L)
{
+ float sum_sample_weight = 0.0f;
+#ifdef __DENOISING_FEATURES__
+ if(state->denoising_feature_weight > 0.0f) {
+ for(int i = 0; i < sd->num_closure; i++) {
+ const ShaderClosure *sc = &sd->closure[i];
+
+ /* transparency is not handled here, but in outer loop */
+ if(!CLOSURE_IS_BSDF(sc->type) || CLOSURE_IS_BSDF_TRANSPARENT(sc->type)) {
+ continue;
+ }
+
+ sum_sample_weight += sc->sample_weight;
+ }
+ }
+ else {
+ sum_sample_weight = 1.0f;
+ }
+#endif /* __DENOISING_FEATURES__ */
+
for(int i = 0; i < sd->num_closure; i++) {
const ShaderClosure *sc = &sd->closure[i];
- if(!CLOSURE_IS_BSDF(sc->type))
- continue;
/* transparency is not handled here, but in outer loop */
- if(sc->type == CLOSURE_BSDF_TRANSPARENT_ID)
+ if(!CLOSURE_IS_BSDF(sc->type) || CLOSURE_IS_BSDF_TRANSPARENT(sc->type)) {
continue;
+ }
int num_samples;
@@ -110,7 +129,8 @@ ccl_device_noinline void kernel_branched_path_surface_indirect_light(KernelGloba
&tp,
&ps,
L,
- &bsdf_ray))
+ &bsdf_ray,
+ sum_sample_weight))
{
continue;
}
@@ -242,14 +262,19 @@ ccl_device void kernel_branched_path_subsurface_scatter(KernelGlobals *kg,
}
#endif /* __SUBSURFACE__ */
-ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, int sample, Ray ray, ccl_global float *buffer)
+ccl_device float kernel_branched_path_integrate(KernelGlobals *kg,
+ RNG *rng,
+ int sample,
+ Ray ray,
+ ccl_global float *buffer,
+ PathRadiance *L,
+ bool *is_shadow_catcher)
{
/* initialize */
- PathRadiance L;
float3 throughput = make_float3(1.0f, 1.0f, 1.0f);
float L_transparent = 0.0f;
- path_radiance_init(&L, kernel_data.film.use_light_pass);
+ path_radiance_init(L, kernel_data.film.use_light_pass);
/* shader data memory used for both volumes and surfaces, saves stack space */
ShaderData sd;
@@ -329,7 +354,7 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
int all = kernel_data.integrator.sample_all_lights_direct;
kernel_branched_path_volume_connect_light(kg, rng, &sd,
- &emission_sd, throughput, &state, &L, all,
+ &emission_sd, throughput, &state, L, all,
&volume_ray, &volume_segment);
/* indirect light sampling */
@@ -337,11 +362,6 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
float num_samples_inv = 1.0f/num_samples;
for(int j = 0; j < num_samples; j++) {
- /* workaround to fix correlation bug in T38710, can find better solution
- * in random number generator later, for now this is done here to not impact
- * performance of rendering without volumes */
- RNG tmp_rng = cmj_hash(*rng, state.rng_offset);
-
PathState ps = state;
Ray pray = ray;
float3 tp = throughput;
@@ -352,8 +372,8 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
/* scatter sample. if we use distance sampling and take just one
* sample for direct and indirect light, we could share this
* computation, but makes code a bit complex */
- float rphase = path_state_rng_1D_for_decision(kg, &tmp_rng, &ps, PRNG_PHASE);
- float rscatter = path_state_rng_1D_for_decision(kg, &tmp_rng, &ps, PRNG_SCATTER_DISTANCE);
+ float rphase = path_state_rng_1D_for_decision(kg, rng, &ps, PRNG_PHASE);
+ float rscatter = path_state_rng_1D_for_decision(kg, rng, &ps, PRNG_SCATTER_DISTANCE);
VolumeIntegrateResult result = kernel_volume_decoupled_scatter(kg,
&ps, &pray, &sd, &tp, rphase, rscatter, &volume_segment, NULL, false);
@@ -366,7 +386,7 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
&sd,
&tp,
&ps,
- &L,
+ L,
&pray))
{
kernel_path_indirect(kg,
@@ -377,19 +397,19 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
tp*num_samples_inv,
num_samples,
&ps,
- &L);
+ L);
/* for render passes, sum and reset indirect light pass variables
* for the next samples */
- path_radiance_sum_indirect(&L);
- path_radiance_reset_indirect(&L);
+ path_radiance_sum_indirect(L);
+ path_radiance_reset_indirect(L);
}
}
}
/* emission and transmittance */
if(volume_segment.closure_flag & SD_EMISSION)
- path_radiance_accum_emission(&L, throughput, volume_segment.accum_emission, state.bounce);
+ path_radiance_accum_emission(L, throughput, volume_segment.accum_emission, state.bounce);
throughput *= volume_segment.accum_transmittance;
/* free cached steps */
@@ -411,20 +431,20 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
path_state_branch(&ps, j, num_samples);
VolumeIntegrateResult result = kernel_volume_integrate(
- kg, &ps, &sd, &volume_ray, &L, &tp, rng, heterogeneous);
+ kg, &ps, &sd, &volume_ray, L, &tp, rng, heterogeneous);
#ifdef __VOLUME_SCATTER__
if(result == VOLUME_PATH_SCATTERED) {
/* todo: support equiangular, MIS and all light sampling.
* alternatively get decoupled ray marching working on the GPU */
- kernel_path_volume_connect_light(kg, rng, &sd, &emission_sd, tp, &state, &L);
+ kernel_path_volume_connect_light(kg, rng, &sd, &emission_sd, tp, &state, L);
if(kernel_path_volume_bounce(kg,
rng,
&sd,
&tp,
&ps,
- &L,
+ L,
&pray))
{
kernel_path_indirect(kg,
@@ -435,12 +455,12 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
tp,
num_samples,
&ps,
- &L);
+ L);
/* for render passes, sum and reset indirect light pass variables
* for the next samples */
- path_radiance_sum_indirect(&L);
- path_radiance_reset_indirect(&L);
+ path_radiance_sum_indirect(L);
+ path_radiance_reset_indirect(L);
}
}
#endif /* __VOLUME_SCATTER__ */
@@ -466,7 +486,7 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
#ifdef __BACKGROUND__
/* sample background shader */
float3 L_background = indirect_background(kg, &emission_sd, &state, &ray);
- path_radiance_accum_background(&L, &state, throughput, L_background);
+ path_radiance_accum_background(L, &state, throughput, L_background);
#endif /* __BACKGROUND__ */
break;
@@ -479,13 +499,16 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
#ifdef __SHADOW_TRICKS__
if((sd.object_flag & SD_OBJECT_SHADOW_CATCHER)) {
- if(state.flag & PATH_RAY_CAMERA) {
- state.flag |= (PATH_RAY_SHADOW_CATCHER | PATH_RAY_SHADOW_CATCHER_ONLY);
- state.catcher_object = sd.object;
- if(!kernel_data.background.transparent) {
- L.shadow_color = indirect_background(kg, &emission_sd, &state, &ray);
- }
+ state.flag |= (PATH_RAY_SHADOW_CATCHER |
+ PATH_RAY_SHADOW_CATCHER_ONLY |
+ PATH_RAY_STORE_SHADOW_INFO);
+ state.catcher_object = sd.object;
+ if(!kernel_data.background.transparent) {
+ L->shadow_background_color =
+ indirect_background(kg, &emission_sd, &state, &ray);
}
+ L->shadow_radiance_sum = path_radiance_clamp_and_sum(kg, L);
+ L->shadow_throughput = average(throughput);
}
else {
state.flag &= ~PATH_RAY_SHADOW_CATCHER_ONLY;
@@ -513,13 +536,13 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
#endif /* __HOLDOUT__ */
/* holdout mask objects do not write data passes */
- kernel_write_data_passes(kg, buffer, &L, &sd, sample, &state, throughput);
+ kernel_write_data_passes(kg, buffer, L, &sd, sample, &state, throughput);
#ifdef __EMISSION__
/* emission */
if(sd.flag & SD_EMISSION) {
float3 emission = indirect_primitive_emission(kg, &sd, isect.t, state.flag, state.ray_pdf);
- path_radiance_accum_emission(&L, throughput, emission, state.bounce);
+ path_radiance_accum_emission(L, throughput, emission, state.bounce);
}
#endif /* __EMISSION__ */
@@ -543,10 +566,12 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
}
}
+ kernel_update_denoising_features(kg, &sd, &state, L);
+
#ifdef __AO__
/* ambient occlusion */
if(kernel_data.integrator.use_ambient_occlusion || (sd.flag & SD_AO)) {
- kernel_branched_path_ao(kg, &sd, &emission_sd, &L, &state, rng, throughput);
+ kernel_branched_path_ao(kg, &sd, &emission_sd, L, &state, rng, throughput);
}
#endif /* __AO__ */
@@ -554,7 +579,7 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
/* bssrdf scatter to a different location on the same object */
if(sd.flag & SD_BSSRDF) {
kernel_branched_path_subsurface_scatter(kg, &sd, &indirect_sd, &emission_sd,
- &L, &state, rng, &ray, throughput);
+ L, &state, rng, &ray, throughput);
}
#endif /* __SUBSURFACE__ */
@@ -567,13 +592,13 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
int all = (kernel_data.integrator.sample_all_lights_direct) ||
(state.flag & PATH_RAY_SHADOW_CATCHER);
kernel_branched_path_surface_connect_light(kg, rng,
- &sd, &emission_sd, &hit_state, throughput, 1.0f, &L, all);
+ &sd, &emission_sd, &hit_state, throughput, 1.0f, L, all);
}
#endif /* __EMISSION__ */
/* indirect light */
kernel_branched_path_surface_indirect_light(kg, rng,
- &sd, &indirect_sd, &emission_sd, throughput, 1.0f, &hit_state, &L);
+ &sd, &indirect_sd, &emission_sd, throughput, 1.0f, &hit_state, L);
/* continue in case of transparency */
throughput *= shader_bsdf_transparency(kg, &sd);
@@ -602,24 +627,15 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
#endif /* __VOLUME__ */
}
- float3 L_sum;
#ifdef __SHADOW_TRICKS__
- if(state.flag & PATH_RAY_SHADOW_CATCHER) {
- L_sum = path_radiance_sum_shadowcatcher(kg, &L, &L_transparent);
- }
- else
+ *is_shadow_catcher = (state.flag & PATH_RAY_SHADOW_CATCHER);
#endif /* __SHADOW_TRICKS__ */
- {
- L_sum = path_radiance_clamp_and_sum(kg, &L);
- }
-
- kernel_write_light_passes(kg, buffer, &L, sample);
#ifdef __KERNEL_DEBUG__
kernel_write_debug_passes(kg, buffer, &state, &debug_data, sample);
#endif /* __KERNEL_DEBUG__ */
- return make_float4(L_sum.x, L_sum.y, L_sum.z, 1.0f - L_transparent);
+ return 1.0f - L_transparent;
}
ccl_device void kernel_branched_path_trace(KernelGlobals *kg,
@@ -640,20 +656,22 @@ ccl_device void kernel_branched_path_trace(KernelGlobals *kg,
kernel_path_trace_setup(kg, rng_state, sample, x, y, &rng, &ray);
/* integrate */
- float4 L;
-
- if(ray.t != 0.0f)
- L = kernel_branched_path_integrate(kg, &rng, sample, ray, buffer);
- else
- L = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+ PathRadiance L;
+ bool is_shadow_catcher;
- /* accumulate result in output buffer */
- kernel_write_pass_float4(buffer, sample, L);
+ if(ray.t != 0.0f) {
+ float alpha = kernel_branched_path_integrate(kg, &rng, sample, ray, buffer, &L, &is_shadow_catcher);
+ kernel_write_result(kg, buffer, sample, &L, alpha, is_shadow_catcher);
+ }
+ else {
+ kernel_write_result(kg, buffer, sample, NULL, 0.0f, false);
+ }
path_rng_end(kg, rng_state, rng);
}
+#endif /* __SPLIT_KERNEL__ */
+
#endif /* __BRANCHED_PATH__ */
CCL_NAMESPACE_END
-
diff --git a/intern/cycles/kernel/kernel_path_state.h b/intern/cycles/kernel/kernel_path_state.h
index c0cd2a63120..5d92fd12201 100644
--- a/intern/cycles/kernel/kernel_path_state.h
+++ b/intern/cycles/kernel/kernel_path_state.h
@@ -35,6 +35,16 @@ ccl_device_inline void path_state_init(KernelGlobals *kg,
state->transmission_bounce = 0;
state->transparent_bounce = 0;
+#ifdef __DENOISING_FEATURES__
+ if(kernel_data.film.pass_denoising_data) {
+ state->flag |= PATH_RAY_STORE_SHADOW_INFO;
+ state->denoising_feature_weight = 1.0f;
+ }
+ else {
+ state->denoising_feature_weight = 0.0f;
+ }
+#endif /* __DENOISING_FEATURES__ */
+
state->min_ray_pdf = FLT_MAX;
state->ray_pdf = 0.0f;
#ifdef __LAMP_MIS__
@@ -128,6 +138,12 @@ ccl_device_inline void path_state_next(KernelGlobals *kg, ccl_addr_space PathSta
/* random number generator next bounce */
state->rng_offset += PRNG_BOUNCE_NUM;
+
+#ifdef __DENOISING_FEATURES__
+ if((state->denoising_feature_weight == 0.0f) && !(state->flag & PATH_RAY_SHADOW_CATCHER)) {
+ state->flag &= ~PATH_RAY_STORE_SHADOW_INFO;
+ }
+#endif
}
ccl_device_inline uint path_state_ray_visibility(KernelGlobals *kg, PathState *state)
diff --git a/intern/cycles/kernel/kernel_path_surface.h b/intern/cycles/kernel/kernel_path_surface.h
index 076c82f3853..dcb577e176f 100644
--- a/intern/cycles/kernel/kernel_path_surface.h
+++ b/intern/cycles/kernel/kernel_path_surface.h
@@ -16,7 +16,7 @@
CCL_NAMESPACE_BEGIN
-#if defined(__BRANCHED_PATH__) || defined(__SUBSURFACE__) || defined(__SHADOW_TRICKS__)
+#if defined(__BRANCHED_PATH__) || defined(__SUBSURFACE__) || defined(__SHADOW_TRICKS__) || defined(__BAKING__)
/* branched path tracing: connect path directly to position on one or more lights and add it to L */
ccl_device_noinline void kernel_branched_path_surface_connect_light(
KernelGlobals *kg,
@@ -70,10 +70,10 @@ ccl_device_noinline void kernel_branched_path_surface_connect_light(
if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) {
/* accumulate */
- path_radiance_accum_light(L, throughput*num_samples_inv, &L_light, shadow, num_samples_inv, state->bounce, is_lamp);
+ path_radiance_accum_light(L, state, throughput*num_samples_inv, &L_light, shadow, num_samples_inv, is_lamp);
}
else {
- path_radiance_accum_total_light(L, throughput*num_samples_inv, &L_light);
+ path_radiance_accum_total_light(L, state, throughput*num_samples_inv, &L_light);
}
}
}
@@ -107,10 +107,10 @@ ccl_device_noinline void kernel_branched_path_surface_connect_light(
if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) {
/* accumulate */
- path_radiance_accum_light(L, throughput*num_samples_inv, &L_light, shadow, num_samples_inv, state->bounce, is_lamp);
+ path_radiance_accum_light(L, state, throughput*num_samples_inv, &L_light, shadow, num_samples_inv, is_lamp);
}
else {
- path_radiance_accum_total_light(L, throughput*num_samples_inv, &L_light);
+ path_radiance_accum_total_light(L, state, throughput*num_samples_inv, &L_light);
}
}
}
@@ -133,10 +133,10 @@ ccl_device_noinline void kernel_branched_path_surface_connect_light(
if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) {
/* accumulate */
- path_radiance_accum_light(L, throughput*num_samples_adjust, &L_light, shadow, num_samples_adjust, state->bounce, is_lamp);
+ path_radiance_accum_light(L, state, throughput*num_samples_adjust, &L_light, shadow, num_samples_adjust, is_lamp);
}
else {
- path_radiance_accum_total_light(L, throughput*num_samples_adjust, &L_light);
+ path_radiance_accum_total_light(L, state, throughput*num_samples_adjust, &L_light);
}
}
}
@@ -155,7 +155,8 @@ ccl_device bool kernel_branched_path_surface_bounce(
ccl_addr_space float3 *throughput,
ccl_addr_space PathState *state,
PathRadiance *L,
- Ray *ray)
+ ccl_addr_space Ray *ray,
+ float sum_sample_weight)
{
/* sample BSDF */
float bsdf_pdf;
@@ -175,6 +176,10 @@ ccl_device bool kernel_branched_path_surface_bounce(
/* modify throughput */
path_radiance_bsdf_bounce(L, throughput, &bsdf_eval, bsdf_pdf, state->bounce, label);
+#ifdef __DENOISING_FEATURES__
+ state->denoising_feature_weight *= sc->sample_weight / (sum_sample_weight * num_samples);
+#endif
+
/* modify path state */
path_state_next(kg, state, label);
@@ -257,10 +262,10 @@ ccl_device_inline void kernel_path_surface_connect_light(KernelGlobals *kg, RNG
if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) {
/* accumulate */
- path_radiance_accum_light(L, throughput, &L_light, shadow, 1.0f, state->bounce, is_lamp);
+ path_radiance_accum_light(L, state, throughput, &L_light, shadow, 1.0f, is_lamp);
}
else {
- path_radiance_accum_total_light(L, throughput, &L_light);
+ path_radiance_accum_total_light(L, state, throughput, &L_light);
}
}
}
diff --git a/intern/cycles/kernel/kernel_path_volume.h b/intern/cycles/kernel/kernel_path_volume.h
index 371f2c1c7cb..dcedf51e479 100644
--- a/intern/cycles/kernel/kernel_path_volume.h
+++ b/intern/cycles/kernel/kernel_path_volume.h
@@ -55,7 +55,7 @@ ccl_device_inline void kernel_path_volume_connect_light(
if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) {
/* accumulate */
- path_radiance_accum_light(L, throughput, &L_light, shadow, 1.0f, state->bounce, is_lamp);
+ path_radiance_accum_light(L, state, throughput, &L_light, shadow, 1.0f, is_lamp);
}
}
}
@@ -184,7 +184,7 @@ ccl_device void kernel_branched_path_volume_connect_light(
if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) {
/* accumulate */
- path_radiance_accum_light(L, tp*num_samples_inv, &L_light, shadow, num_samples_inv, state->bounce, is_lamp);
+ path_radiance_accum_light(L, state, tp*num_samples_inv, &L_light, shadow, num_samples_inv, is_lamp);
}
}
}
@@ -233,7 +233,7 @@ ccl_device void kernel_branched_path_volume_connect_light(
if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) {
/* accumulate */
- path_radiance_accum_light(L, tp*num_samples_inv, &L_light, shadow, num_samples_inv, state->bounce, is_lamp);
+ path_radiance_accum_light(L, state, tp*num_samples_inv, &L_light, shadow, num_samples_inv, is_lamp);
}
}
}
@@ -271,7 +271,7 @@ ccl_device void kernel_branched_path_volume_connect_light(
if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) {
/* accumulate */
- path_radiance_accum_light(L, tp, &L_light, shadow, 1.0f, state->bounce, is_lamp);
+ path_radiance_accum_light(L, state, tp, &L_light, shadow, 1.0f, is_lamp);
}
}
}
diff --git a/intern/cycles/kernel/kernel_projection.h b/intern/cycles/kernel/kernel_projection.h
index 9a2b0884a7e..cbb2442d1dc 100644
--- a/intern/cycles/kernel/kernel_projection.h
+++ b/intern/cycles/kernel/kernel_projection.h
@@ -57,6 +57,9 @@ ccl_device float3 spherical_to_direction(float theta, float phi)
ccl_device float2 direction_to_equirectangular_range(float3 dir, float4 range)
{
+ if(is_zero(dir))
+ return make_float2(0.0f, 0.0f);
+
float u = (atan2f(dir.y, dir.x) - range.y) / range.x;
float v = (acosf(dir.z / len(dir)) - range.w) / range.z;
diff --git a/intern/cycles/kernel/kernel_queues.h b/intern/cycles/kernel/kernel_queues.h
index 96bc636d5ac..e32d4bbbc1b 100644
--- a/intern/cycles/kernel/kernel_queues.h
+++ b/intern/cycles/kernel/kernel_queues.h
@@ -128,6 +128,21 @@ ccl_device unsigned int get_global_queue_index(
return my_gqidx;
}
+ccl_device int dequeue_ray_index(
+ int queue_number,
+ ccl_global int *queues,
+ int queue_size,
+ ccl_global int *queue_index)
+{
+ int index = atomic_fetch_and_dec_uint32((ccl_global uint*)&queue_index[queue_number])-1;
+
+ if(index < 0) {
+ return QUEUE_EMPTY_SLOT;
+ }
+
+ return queues[index + queue_number * queue_size];
+}
+
CCL_NAMESPACE_END
#endif // __KERNEL_QUEUE_H__
diff --git a/intern/cycles/kernel/kernel_random.h b/intern/cycles/kernel/kernel_random.h
index d4f0caff5de..e8a912ccc0b 100644
--- a/intern/cycles/kernel/kernel_random.h
+++ b/intern/cycles/kernel/kernel_random.h
@@ -20,14 +20,15 @@ CCL_NAMESPACE_BEGIN
#ifdef __SOBOL__
-/* skip initial numbers that are not as well distributed, especially the
+/* Skip initial numbers that are not as well distributed, especially the
* first sequence is just 0 everywhere, which can be problematic for e.g.
- * path termination */
+ * path termination.
+ */
#define SOBOL_SKIP 64
-/* High Dimensional Sobol */
+/* High Dimensional Sobol. */
-/* van der corput radical inverse */
+/* Van der Corput radical inverse. */
ccl_device uint van_der_corput(uint bits)
{
bits = (bits << 16) | (bits >> 16);
@@ -38,58 +39,63 @@ ccl_device uint van_der_corput(uint bits)
return bits;
}
-/* sobol radical inverse */
+/* Sobol radical inverse. */
ccl_device uint sobol(uint i)
{
uint r = 0;
-
- for(uint v = 1U << 31; i; i >>= 1, v ^= v >> 1)
- if(i & 1)
+ for(uint v = 1U << 31; i; i >>= 1, v ^= v >> 1) {
+ if(i & 1) {
r ^= v;
-
+ }
+ }
return r;
}
-/* inverse of sobol radical inverse */
+/* Inverse of sobol radical inverse. */
ccl_device uint sobol_inverse(uint i)
{
const uint msb = 1U << 31;
uint r = 0;
-
- for(uint v = 1; i; i <<= 1, v ^= v << 1)
- if(i & msb)
+ for(uint v = 1; i; i <<= 1, v ^= v << 1) {
+ if(i & msb) {
r ^= v;
-
+ }
+ }
return r;
}
-/* multidimensional sobol with generator matrices
- * dimension 0 and 1 are equal to van_der_corput() and sobol() respectively */
+/* Multidimensional sobol with generator matrices
+ * dimension 0 and 1 are equal to van_der_corput() and sobol() respectively.
+ */
ccl_device uint sobol_dimension(KernelGlobals *kg, int index, int dimension)
{
uint result = 0;
uint i = index;
-
- for(uint j = 0; i; i >>= 1, j++)
- if(i & 1)
+ for(uint j = 0; i; i >>= 1, j++) {
+ if(i & 1) {
result ^= kernel_tex_fetch(__sobol_directions, 32*dimension + j);
-
+ }
+ }
return result;
}
-/* lookup index and x/y coordinate, assumes m is a power of two */
-ccl_device uint sobol_lookup(const uint m, const uint frame, const uint ex, const uint ey, uint *x, uint *y)
+/* Lookup index and x/y coordinate, assumes m is a power of two. */
+ccl_device uint sobol_lookup(const uint m,
+ const uint frame,
+ const uint ex,
+ const uint ey,
+ uint *x, uint *y)
{
- /* shift is constant per frame */
+ /* Shift is constant per frame. */
const uint shift = frame << (m << 1);
const uint sobol_shift = sobol(shift);
- /* van der Corput is its own inverse */
+ /* Van der Corput is its own inverse. */
const uint lower = van_der_corput(ex << (32 - m));
- /* need to compensate for ey difference and shift */
+ /* Need to compensate for ey difference and shift. */
const uint sobol_lower = sobol(lower);
- const uint mask = ~-(1 << m) << (32 - m); /* only m upper bits */
+ const uint mask = ~-(1 << m) << (32 - m); /* Only m upper bits. */
const uint delta = ((ey << (32 - m)) ^ sobol_lower ^ sobol_shift) & mask;
- /* only use m upper bits for the index (m is a power of two) */
+ /* Only use m upper bits for the index (m is a power of two). */
const uint sobol_result = delta | (delta >> m);
const uint upper = sobol_inverse(sobol_result);
const uint index = shift | upper | lower;
@@ -98,11 +104,14 @@ ccl_device uint sobol_lookup(const uint m, const uint frame, const uint ex, cons
return index;
}
-ccl_device_forceinline float path_rng_1D(KernelGlobals *kg, RNG *rng, int sample, int num_samples, int dimension)
+ccl_device_forceinline float path_rng_1D(KernelGlobals *kg,
+ RNG *rng,
+ int sample, int num_samples,
+ int dimension)
{
#ifdef __CMJ__
if(kernel_data.integrator.sampling_pattern == SAMPLING_PATTERN_CMJ) {
- /* correlated multi-jittered */
+ /* Correlated multi-jitter. */
int p = *rng + dimension;
return cmj_sample_1D(sample, num_samples, p);
}
@@ -113,7 +122,7 @@ ccl_device_forceinline float path_rng_1D(KernelGlobals *kg, RNG *rng, int sample
float r = (float)result * (1.0f/(float)0xFFFFFFFF);
return r;
#else
- /* compute sobol sequence value using direction vectors */
+ /* Compute sobol sequence value using direction vectors. */
uint result = sobol_dimension(kg, sample + SOBOL_SKIP, dimension);
float r = (float)result * (1.0f/(float)0xFFFFFFFF);
@@ -130,24 +139,33 @@ ccl_device_forceinline float path_rng_1D(KernelGlobals *kg, RNG *rng, int sample
#endif
}
-ccl_device_forceinline void path_rng_2D(KernelGlobals *kg, RNG *rng, int sample, int num_samples, int dimension, float *fx, float *fy)
+ccl_device_forceinline void path_rng_2D(KernelGlobals *kg,
+ RNG *rng,
+ int sample, int num_samples,
+ int dimension,
+ float *fx, float *fy)
{
#ifdef __CMJ__
if(kernel_data.integrator.sampling_pattern == SAMPLING_PATTERN_CMJ) {
- /* correlated multi-jittered */
+ /* Correlated multi-jitter. */
int p = *rng + dimension;
cmj_sample_2D(sample, num_samples, p, fx, fy);
}
else
#endif
{
- /* sobol */
+ /* Sobol. */
*fx = path_rng_1D(kg, rng, sample, num_samples, dimension);
*fy = path_rng_1D(kg, rng, sample, num_samples, dimension + 1);
}
}
-ccl_device_inline void path_rng_init(KernelGlobals *kg, ccl_global uint *rng_state, int sample, int num_samples, RNG *rng, int x, int y, float *fx, float *fy)
+ccl_device_inline void path_rng_init(KernelGlobals *kg,
+ ccl_global uint *rng_state,
+ int sample, int num_samples,
+ RNG *rng,
+ int x, int y,
+ float *fx, float *fy)
{
#ifdef __SOBOL_FULL_SCREEN__
uint px, py;
@@ -182,29 +200,43 @@ ccl_device_inline void path_rng_init(KernelGlobals *kg, ccl_global uint *rng_sta
#endif
}
-ccl_device void path_rng_end(KernelGlobals *kg, ccl_global uint *rng_state, RNG rng)
+ccl_device void path_rng_end(KernelGlobals *kg,
+ ccl_global uint *rng_state,
+ RNG rng)
{
/* nothing to do */
}
-#else
+#else /* __SOBOL__ */
/* Linear Congruential Generator */
-ccl_device_forceinline float path_rng_1D(KernelGlobals *kg, RNG *rng, int sample, int num_samples, int dimension)
+ccl_device_forceinline float path_rng_1D(KernelGlobals *kg,
+ RNG *rng,
+ int sample, int num_samples,
+ int dimension)
{
/* implicit mod 2^32 */
*rng = (1103515245*(*rng) + 12345);
return (float)*rng * (1.0f/(float)0xFFFFFFFF);
}
-ccl_device_inline void path_rng_2D(KernelGlobals *kg, RNG *rng, int sample, int num_samples, int dimension, float *fx, float *fy)
+ccl_device_inline void path_rng_2D(KernelGlobals *kg,
+ RNG *rng,
+ int sample, int num_samples,
+ int dimension,
+ float *fx, float *fy)
{
*fx = path_rng_1D(kg, rng, sample, num_samples, dimension);
*fy = path_rng_1D(kg, rng, sample, num_samples, dimension + 1);
}
-ccl_device void path_rng_init(KernelGlobals *kg, ccl_global uint *rng_state, int sample, int num_samples, RNG *rng, int x, int y, float *fx, float *fy)
+ccl_device void path_rng_init(KernelGlobals *kg,
+ ccl_global uint *rng_state,
+ int sample, int num_samples,
+ RNG *rng,
+ int x, int y,
+ float *fx, float *fy)
{
/* load state */
*rng = *rng_state;
@@ -220,13 +252,15 @@ ccl_device void path_rng_init(KernelGlobals *kg, ccl_global uint *rng_state, int
}
}
-ccl_device void path_rng_end(KernelGlobals *kg, ccl_global uint *rng_state, RNG rng)
+ccl_device void path_rng_end(KernelGlobals *kg,
+ ccl_global uint *rng_state,
+ RNG rng)
{
/* store state for next sample */
*rng_state = rng;
}
-#endif
+#endif /* __SOBOL__ */
/* Linear Congruential Generator */
@@ -257,49 +291,108 @@ ccl_device uint lcg_init(uint seed)
* dimension to avoid using the same sequence twice.
*
* For branches in the path we must be careful not to reuse the same number
- * in a sequence and offset accordingly. */
+ * in a sequence and offset accordingly.
+ */
-ccl_device_inline float path_state_rng_1D(KernelGlobals *kg, RNG *rng, const ccl_addr_space PathState *state, int dimension)
+ccl_device_inline float path_state_rng_1D(KernelGlobals *kg,
+ RNG *rng,
+ const ccl_addr_space PathState *state,
+ int dimension)
{
- return path_rng_1D(kg, rng, state->sample, state->num_samples, state->rng_offset + dimension);
+ return path_rng_1D(kg,
+ rng,
+ state->sample, state->num_samples,
+ state->rng_offset + dimension);
}
-ccl_device_inline float path_state_rng_1D_for_decision(KernelGlobals *kg, RNG *rng, const ccl_addr_space PathState *state, int dimension)
+ccl_device_inline float path_state_rng_1D_for_decision(
+ KernelGlobals *kg,
+ RNG *rng,
+ const ccl_addr_space PathState *state,
+ int dimension)
{
- /* the rng_offset is not increased for transparent bounces. if we do then
+ /* The rng_offset is not increased for transparent bounces. if we do then
* fully transparent objects can become subtly visible by the different
* sampling patterns used where the transparent object is.
*
* however for some random numbers that will determine if we next bounce
* is transparent we do need to increase the offset to avoid always making
- * the same decision */
- int rng_offset = state->rng_offset + state->transparent_bounce*PRNG_BOUNCE_NUM;
- return path_rng_1D(kg, rng, state->sample, state->num_samples, rng_offset + dimension);
+ * the same decision. */
+ const int rng_offset = state->rng_offset + state->transparent_bounce * PRNG_BOUNCE_NUM;
+ return path_rng_1D(kg,
+ rng,
+ state->sample, state->num_samples,
+ rng_offset + dimension);
}
-ccl_device_inline void path_state_rng_2D(KernelGlobals *kg, RNG *rng, const ccl_addr_space PathState *state, int dimension, float *fx, float *fy)
+ccl_device_inline void path_state_rng_2D(KernelGlobals *kg,
+ RNG *rng,
+ const ccl_addr_space PathState *state,
+ int dimension,
+ float *fx, float *fy)
{
- path_rng_2D(kg, rng, state->sample, state->num_samples, state->rng_offset + dimension, fx, fy);
+ path_rng_2D(kg,
+ rng,
+ state->sample, state->num_samples,
+ state->rng_offset + dimension,
+ fx, fy);
}
-ccl_device_inline float path_branched_rng_1D(KernelGlobals *kg, RNG *rng, const ccl_addr_space PathState *state, int branch, int num_branches, int dimension)
+ccl_device_inline float path_branched_rng_1D(
+ KernelGlobals *kg,
+ RNG *rng,
+ const ccl_addr_space PathState *state,
+ int branch,
+ int num_branches,
+ int dimension)
{
- return path_rng_1D(kg, rng, state->sample*num_branches + branch, state->num_samples*num_branches, state->rng_offset + dimension);
+ return path_rng_1D(kg,
+ rng,
+ state->sample * num_branches + branch,
+ state->num_samples * num_branches,
+ state->rng_offset + dimension);
}
-ccl_device_inline float path_branched_rng_1D_for_decision(KernelGlobals *kg, RNG *rng, const ccl_addr_space PathState *state, int branch, int num_branches, int dimension)
+ccl_device_inline float path_branched_rng_1D_for_decision(
+ KernelGlobals *kg,
+ RNG *rng,
+ const ccl_addr_space PathState *state,
+ int branch,
+ int num_branches,
+ int dimension)
{
- int rng_offset = state->rng_offset + state->transparent_bounce*PRNG_BOUNCE_NUM;
- return path_rng_1D(kg, rng, state->sample*num_branches + branch, state->num_samples*num_branches, rng_offset + dimension);
+ const int rng_offset = state->rng_offset + state->transparent_bounce * PRNG_BOUNCE_NUM;
+ return path_rng_1D(kg,
+ rng,
+ state->sample * num_branches + branch,
+ state->num_samples * num_branches,
+ rng_offset + dimension);
}
-ccl_device_inline void path_branched_rng_2D(KernelGlobals *kg, RNG *rng, const ccl_addr_space PathState *state, int branch, int num_branches, int dimension, float *fx, float *fy)
+ccl_device_inline void path_branched_rng_2D(
+ KernelGlobals *kg,
+ RNG *rng,
+ const ccl_addr_space PathState *state,
+ int branch,
+ int num_branches,
+ int dimension,
+ float *fx, float *fy)
{
- path_rng_2D(kg, rng, state->sample*num_branches + branch, state->num_samples*num_branches, state->rng_offset + dimension, fx, fy);
+ path_rng_2D(kg,
+ rng,
+ state->sample * num_branches + branch,
+ state->num_samples * num_branches,
+ state->rng_offset + dimension,
+ fx, fy);
}
-/* Utitility functions to get light termination value, since it might not be needed in many cases. */
-ccl_device_inline float path_state_rng_light_termination(KernelGlobals *kg, RNG *rng, const ccl_addr_space PathState *state)
+/* Utitility functions to get light termination value,
+ * since it might not be needed in many cases.
+ */
+ccl_device_inline float path_state_rng_light_termination(
+ KernelGlobals *kg,
+ RNG *rng,
+ const ccl_addr_space PathState *state)
{
if(kernel_data.integrator.light_inv_rr_threshold > 0.0f) {
return path_state_rng_1D_for_decision(kg, rng, state, PRNG_LIGHT_TERMINATE);
@@ -307,15 +400,27 @@ ccl_device_inline float path_state_rng_light_termination(KernelGlobals *kg, RNG
return 0.0f;
}
-ccl_device_inline float path_branched_rng_light_termination(KernelGlobals *kg, RNG *rng, const ccl_addr_space PathState *state, int branch, int num_branches)
+ccl_device_inline float path_branched_rng_light_termination(
+ KernelGlobals *kg,
+ RNG *rng,
+ const ccl_addr_space PathState *state,
+ int branch,
+ int num_branches)
{
if(kernel_data.integrator.light_inv_rr_threshold > 0.0f) {
- return path_branched_rng_1D_for_decision(kg, rng, state, branch, num_branches, PRNG_LIGHT_TERMINATE);
+ return path_branched_rng_1D_for_decision(kg,
+ rng,
+ state,
+ branch,
+ num_branches,
+ PRNG_LIGHT_TERMINATE);
}
return 0.0f;
}
-ccl_device_inline void path_state_branch(ccl_addr_space PathState *state, int branch, int num_branches)
+ccl_device_inline void path_state_branch(ccl_addr_space PathState *state,
+ int branch,
+ int num_branches)
{
/* path is splitting into a branch, adjust so that each branch
* still gets a unique sample from the same sequence */
@@ -324,14 +429,17 @@ ccl_device_inline void path_state_branch(ccl_addr_space PathState *state, int br
state->num_samples = state->num_samples*num_branches;
}
-ccl_device_inline uint lcg_state_init(RNG *rng, int rng_offset, int sample, uint scramble)
+ccl_device_inline uint lcg_state_init(RNG *rng,
+ int rng_offset,
+ int sample,
+ uint scramble)
{
return lcg_init(*rng + rng_offset + sample*scramble);
}
ccl_device float lcg_step_float_addrspace(ccl_addr_space uint *rng)
{
- /* implicit mod 2^32 */
+ /* Implicit mod 2^32 */
*rng = (1103515245*(*rng) + 12345);
return (float)*rng * (1.0f/(float)0xFFFFFFFF);
}
diff --git a/intern/cycles/kernel/kernel_shader.h b/intern/cycles/kernel/kernel_shader.h
index 8c0c5e90a3e..c66f52255f0 100644
--- a/intern/cycles/kernel/kernel_shader.h
+++ b/intern/cycles/kernel/kernel_shader.h
@@ -99,7 +99,7 @@ ccl_device_noinline void shader_setup_from_ray(KernelGlobals *kg,
/* smooth normal */
if(sd->shader & SHADER_SMOOTH_NORMAL)
- sd->N = triangle_smooth_normal(kg, sd->prim, sd->u, sd->v);
+ sd->N = triangle_smooth_normal(kg, Ng, sd->prim, sd->u, sd->v);
#ifdef __DPDU__
/* dPdu/dPdv */
@@ -186,7 +186,7 @@ void shader_setup_from_subsurface(
sd->N = Ng;
if(sd->shader & SHADER_SMOOTH_NORMAL)
- sd->N = triangle_smooth_normal(kg, sd->prim, sd->u, sd->v);
+ sd->N = triangle_smooth_normal(kg, Ng, sd->prim, sd->u, sd->v);
# ifdef __DPDU__
/* dPdu/dPdv */
@@ -300,7 +300,7 @@ ccl_device_inline void shader_setup_from_sample(KernelGlobals *kg,
if(sd->type & PRIMITIVE_TRIANGLE) {
/* smooth normal */
if(sd->shader & SHADER_SMOOTH_NORMAL) {
- sd->N = triangle_smooth_normal(kg, sd->prim, sd->u, sd->v);
+ sd->N = triangle_smooth_normal(kg, Ng, sd->prim, sd->u, sd->v);
#ifdef __INSTANCING__
if(!(sd->object_flag & SD_OBJECT_TRANSFORM_APPLIED)) {
diff --git a/intern/cycles/kernel/kernel_shadow.h b/intern/cycles/kernel/kernel_shadow.h
index db6f839d9ed..fab5946970d 100644
--- a/intern/cycles/kernel/kernel_shadow.h
+++ b/intern/cycles/kernel/kernel_shadow.h
@@ -379,7 +379,7 @@ ccl_device bool shadow_blocked_transparent_stepped(
float3 *shadow)
{
bool blocked, is_transparent_isect;
- if (skip_object == OBJECT_NONE) {
+ if(skip_object == OBJECT_NONE) {
blocked = scene_intersect(kg,
*ray,
PATH_RAY_SHADOW_OPAQUE,
diff --git a/intern/cycles/kernel/kernel_subsurface.h b/intern/cycles/kernel/kernel_subsurface.h
index f75e9337bdb..6475d4b66fd 100644
--- a/intern/cycles/kernel/kernel_subsurface.h
+++ b/intern/cycles/kernel/kernel_subsurface.h
@@ -140,7 +140,7 @@ ccl_device_inline float3 subsurface_scatter_eval(ShaderData *sd,
}
/* replace closures with a single diffuse bsdf closure after scatter step */
-ccl_device void subsurface_scatter_setup_diffuse_bsdf(ShaderData *sd, float3 weight, bool hit, float3 N)
+ccl_device void subsurface_scatter_setup_diffuse_bsdf(ShaderData *sd, ShaderClosure *sc, float3 weight, bool hit, float3 N)
{
sd->flag &= ~SD_CLOSURE_FLAGS;
sd->randb_closure = 0.0f;
@@ -148,15 +148,35 @@ ccl_device void subsurface_scatter_setup_diffuse_bsdf(ShaderData *sd, float3 wei
sd->num_closure_extra = 0;
if(hit) {
- DiffuseBsdf *bsdf = (DiffuseBsdf*)bsdf_alloc(sd, sizeof(DiffuseBsdf), weight);
-
- if(bsdf) {
- bsdf->N = N;
- sd->flag |= bsdf_diffuse_setup(bsdf);
-
- /* replace CLOSURE_BSDF_DIFFUSE_ID with this special ID so render passes
- * can recognize it as not being a regular diffuse closure */
- bsdf->type = CLOSURE_BSDF_BSSRDF_ID;
+ Bssrdf *bssrdf = (Bssrdf *)sc;
+#ifdef __PRINCIPLED__
+ if(bssrdf->type == CLOSURE_BSSRDF_PRINCIPLED_ID) {
+ PrincipledDiffuseBsdf *bsdf = (PrincipledDiffuseBsdf*)bsdf_alloc(sd, sizeof(PrincipledDiffuseBsdf), weight);
+
+ if(bsdf) {
+ bsdf->N = N;
+ bsdf->roughness = bssrdf->roughness;
+ sd->flag |= bsdf_principled_diffuse_setup(bsdf);
+
+ /* replace CLOSURE_BSDF_PRINCIPLED_DIFFUSE_ID with this special ID so render passes
+ * can recognize it as not being a regular Disney principled diffuse closure */
+ bsdf->type = CLOSURE_BSDF_BSSRDF_PRINCIPLED_ID;
+ }
+ }
+ else if(CLOSURE_IS_BSDF_BSSRDF(bssrdf->type) ||
+ CLOSURE_IS_BSSRDF(bssrdf->type))
+#endif /* __PRINCIPLED__ */
+ {
+ DiffuseBsdf *bsdf = (DiffuseBsdf*)bsdf_alloc(sd, sizeof(DiffuseBsdf), weight);
+
+ if(bsdf) {
+ bsdf->N = N;
+ sd->flag |= bsdf_diffuse_setup(bsdf);
+
+ /* replace CLOSURE_BSDF_DIFFUSE_ID with this special ID so render passes
+ * can recognize it as not being a regular diffuse closure */
+ bsdf->type = CLOSURE_BSDF_BSSRDF_ID;
+ }
}
}
}
@@ -379,6 +399,12 @@ ccl_device_noinline void subsurface_scatter_multi_setup(
#else
Ray *ray = &ss_isect->ray;
#endif
+
+ /* Workaround for AMD GPU OpenCL compiler. Most probably cache bypass issue. */
+#if defined(__SPLIT_KERNEL__) && defined(__KERNEL_OPENCL_AMD__) && defined(__KERNEL_GPU__)
+ kernel_split_params.dummy_sd_flag = sd->flag;
+#endif
+
/* Setup new shading point. */
shader_setup_from_subsurface(kg, sd, &ss_isect->hits[hit], ray);
@@ -388,12 +414,11 @@ ccl_device_noinline void subsurface_scatter_multi_setup(
subsurface_color_bump_blur(kg, sd, state, state_flag, &weight, &N);
/* Setup diffuse BSDF. */
- subsurface_scatter_setup_diffuse_bsdf(sd, weight, true, N);
+ subsurface_scatter_setup_diffuse_bsdf(sd, sc, weight, true, N);
}
-#ifndef __SPLIT_KERNEL__
/* subsurface scattering step, from a point on the surface to another nearby point on the same object */
-ccl_device void subsurface_scatter_step(KernelGlobals *kg, ShaderData *sd, PathState *state,
+ccl_device void subsurface_scatter_step(KernelGlobals *kg, ShaderData *sd, ccl_addr_space PathState *state,
int state_flag, ShaderClosure *sc, uint *lcg_state, float disk_u, float disk_v, bool all)
{
float3 eval = make_float3(0.0f, 0.0f, 0.0f);
@@ -454,6 +479,10 @@ ccl_device void subsurface_scatter_step(KernelGlobals *kg, ShaderData *sd, PathS
if(ss_isect.num_hits > 0) {
float3 origP = sd->P;
+ /* Workaround for AMD GPU OpenCL compiler. Most probably cache bypass issue. */
+#if defined(__SPLIT_KERNEL__) && defined(__KERNEL_OPENCL_AMD__) && defined(__KERNEL_GPU__)
+ kernel_split_params.dummy_sd_flag = sd->flag;
+#endif
/* setup new shading point */
shader_setup_from_subsurface(kg, sd, &ss_isect.hits[0], &ray);
@@ -479,9 +508,8 @@ ccl_device void subsurface_scatter_step(KernelGlobals *kg, ShaderData *sd, PathS
subsurface_color_bump_blur(kg, sd, state, state_flag, &eval, &N);
/* setup diffuse bsdf */
- subsurface_scatter_setup_diffuse_bsdf(sd, eval, (ss_isect.num_hits > 0), N);
+ subsurface_scatter_setup_diffuse_bsdf(sd, sc, eval, (ss_isect.num_hits > 0), N);
}
-#endif /* ! __SPLIT_KERNEL__ */
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernel_textures.h b/intern/cycles/kernel/kernel_textures.h
index cb1a3f40dee..aa5b32803a5 100644
--- a/intern/cycles/kernel/kernel_textures.h
+++ b/intern/cycles/kernel/kernel_textures.h
@@ -82,10 +82,10 @@ KERNEL_TEX(uint, texture_uint, __sobol_directions)
# if __CUDA_ARCH__ < 300
/* full-float image */
KERNEL_IMAGE_TEX(float4, texture_image_float4, __tex_image_float4_000)
-KERNEL_IMAGE_TEX(float4, texture_image_float4, __tex_image_float4_001)
-KERNEL_IMAGE_TEX(float4, texture_image_float4, __tex_image_float4_002)
-KERNEL_IMAGE_TEX(float4, texture_image_float4, __tex_image_float4_003)
-KERNEL_IMAGE_TEX(float4, texture_image_float4, __tex_image_float4_004)
+KERNEL_IMAGE_TEX(float4, texture_image_float4, __tex_image_float4_008)
+KERNEL_IMAGE_TEX(float4, texture_image_float4, __tex_image_float4_016)
+KERNEL_IMAGE_TEX(float4, texture_image_float4, __tex_image_float4_024)
+KERNEL_IMAGE_TEX(float4, texture_image_float4, __tex_image_float4_032)
KERNEL_IMAGE_TEX(float4, texture_image3d_float4, __tex_image_float4_3d_000)
KERNEL_IMAGE_TEX(float4, texture_image3d_float4, __tex_image_float4_3d_001)
@@ -93,91 +93,93 @@ KERNEL_IMAGE_TEX(float4, texture_image3d_float4, __tex_image_float4_3d_002)
KERNEL_IMAGE_TEX(float4, texture_image3d_float4, __tex_image_float4_3d_003)
KERNEL_IMAGE_TEX(float4, texture_image3d_float4, __tex_image_float4_3d_004)
-/* image */
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_005)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_006)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_007)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_008)
+/* image
+ * These texture names are encoded to their flattened slots as
+ * ImageManager::type_index_to_flattened_slot() returns them. */
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_001)
KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_009)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_010)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_011)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_012)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_013)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_014)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_015)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_016)
KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_017)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_018)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_019)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_020)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_021)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_022)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_023)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_024)
KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_025)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_026)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_027)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_028)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_029)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_030)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_031)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_032)
KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_033)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_034)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_035)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_036)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_037)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_038)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_039)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_040)
KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_041)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_042)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_043)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_044)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_045)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_046)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_047)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_048)
KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_049)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_050)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_051)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_052)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_053)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_054)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_055)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_056)
KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_057)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_058)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_059)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_060)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_061)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_062)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_063)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_064)
KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_065)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_066)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_067)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_068)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_069)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_070)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_071)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_072)
KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_073)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_074)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_075)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_076)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_077)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_078)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_079)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_080)
KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_081)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_082)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_083)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_084)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_085)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_086)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_087)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_088)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_089)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_097)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_105)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_113)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_121)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_129)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_137)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_145)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_153)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_161)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_169)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_177)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_185)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_193)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_201)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_209)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_217)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_225)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_233)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_241)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_249)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_257)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_265)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_273)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_281)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_289)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_297)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_305)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_313)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_321)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_329)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_337)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_345)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_353)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_361)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_369)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_377)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_385)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_393)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_401)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_409)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_417)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_425)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_433)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_441)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_449)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_457)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_465)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_473)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_481)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_489)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_497)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_505)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_513)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_521)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_529)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_537)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_545)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_553)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_561)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_569)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_577)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_585)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_593)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_601)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_609)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_617)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_625)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_633)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_641)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_649)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_657)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_665)
# else
/* bindless textures */
diff --git a/intern/cycles/kernel/kernel_types.h b/intern/cycles/kernel/kernel_types.h
index 623f3728c69..e6a62c42a38 100644
--- a/intern/cycles/kernel/kernel_types.h
+++ b/intern/cycles/kernel/kernel_types.h
@@ -64,6 +64,18 @@ CCL_NAMESPACE_BEGIN
# define WORK_POOL_SIZE WORK_POOL_SIZE_CPU
#endif
+
+#define SHADER_SORT_BLOCK_SIZE 2048
+
+#ifdef __KERNEL_OPENCL__
+# define SHADER_SORT_LOCAL_SIZE 64
+#elif defined(__KERNEL_CUDA__)
+# define SHADER_SORT_LOCAL_SIZE 32
+#else
+# define SHADER_SORT_LOCAL_SIZE 1
+#endif
+
+
/* device capabilities */
#ifdef __KERNEL_CPU__
# ifdef __KERNEL_SSE2__
@@ -71,21 +83,18 @@ CCL_NAMESPACE_BEGIN
# endif
# define __KERNEL_SHADING__
# define __KERNEL_ADV_SHADING__
-# ifndef __SPLIT_KERNEL__
-# define __BRANCHED_PATH__
-# endif
+# define __BRANCHED_PATH__
# ifdef WITH_OSL
# define __OSL__
# endif
+# define __PRINCIPLED__
# define __SUBSURFACE__
# define __CMJ__
# define __VOLUME__
# define __VOLUME_SCATTER__
# define __SHADOW_RECORD_ALL__
-# ifndef __SPLIT_KERNEL__
-# define __VOLUME_DECOUPLED__
-# define __VOLUME_RECORD_ALL__
-# endif
+# define __VOLUME_DECOUPLED__
+# define __VOLUME_RECORD_ALL__
#endif /* __KERNEL_CPU__ */
#ifdef __KERNEL_CUDA__
@@ -94,10 +103,11 @@ CCL_NAMESPACE_BEGIN
# define __VOLUME__
# define __VOLUME_SCATTER__
# define __SUBSURFACE__
+# define __PRINCIPLED__
# define __SHADOW_RECORD_ALL__
+# define __CMJ__
# ifndef __SPLIT_KERNEL__
# define __BRANCHED_PATH__
-# define __CMJ__
# endif
#endif /* __KERNEL_CUDA__ */
@@ -109,43 +119,44 @@ CCL_NAMESPACE_BEGIN
# define __KERNEL_SHADING__
# define __KERNEL_ADV_SHADING__
# define __SUBSURFACE__
+# define __PRINCIPLED__
# define __VOLUME__
# define __VOLUME_SCATTER__
# define __SHADOW_RECORD_ALL__
-# ifdef __KERNEL_EXPERIMENTAL__
-# define __CMJ__
-# endif
+# define __CMJ__
+# define __BRANCHED_PATH__
# endif /* __KERNEL_OPENCL_NVIDIA__ */
# ifdef __KERNEL_OPENCL_APPLE__
# define __KERNEL_SHADING__
# define __KERNEL_ADV_SHADING__
+# define __PRINCIPLED__
+# define __CMJ__
/* TODO(sergey): Currently experimental section is ignored here,
* this is because megakernel in device_opencl does not support
* custom cflags depending on the scene features.
*/
-# ifdef __KERNEL_EXPERIMENTAL__
-# define __CMJ__
-# endif
-# endif /* __KERNEL_OPENCL_NVIDIA__ */
+# endif /* __KERNEL_OPENCL_APPLE__ */
# ifdef __KERNEL_OPENCL_AMD__
# define __CL_USE_NATIVE__
# define __KERNEL_SHADING__
# define __KERNEL_ADV_SHADING__
# define __SUBSURFACE__
+# define __PRINCIPLED__
# define __VOLUME__
# define __VOLUME_SCATTER__
# define __SHADOW_RECORD_ALL__
+# define __CMJ__
+# define __BRANCHED_PATH__
# endif /* __KERNEL_OPENCL_AMD__ */
# ifdef __KERNEL_OPENCL_INTEL_CPU__
# define __CL_USE_NATIVE__
# define __KERNEL_SHADING__
# define __KERNEL_ADV_SHADING__
-# ifdef __KERNEL_EXPERIMENTAL__
-# define __CMJ__
-# endif
+# define __PRINCIPLED__
+# define __CMJ__
# endif /* __KERNEL_OPENCL_INTEL_CPU__ */
#endif /* __KERNEL_OPENCL__ */
@@ -165,6 +176,8 @@ CCL_NAMESPACE_BEGIN
#define __PATCH_EVAL__
#define __SHADOW_TRICKS__
+#define __DENOISING_FEATURES__
+
#ifdef __KERNEL_SHADING__
# define __SVM__
# define __EMISSION__
@@ -220,7 +233,13 @@ CCL_NAMESPACE_BEGIN
# undef __TRANSPARENT_SHADOWS__
#endif
#ifdef __NO_SHADOW_TRICKS__
-#undef __SHADOW_TRICKS__
+# undef __SHADOW_TRICKS__
+#endif
+#ifdef __NO_PRINCIPLED__
+# undef __PRINCIPLED__
+#endif
+#ifdef __NO_DENOISING__
+# undef __DENOISING_FEATURES__
#endif
/* Random Numbers */
@@ -303,31 +322,32 @@ enum SamplingPattern {
/* these flags values correspond to raytypes in osl.cpp, so keep them in sync! */
enum PathRayFlag {
- PATH_RAY_CAMERA = 1,
- PATH_RAY_REFLECT = 2,
- PATH_RAY_TRANSMIT = 4,
- PATH_RAY_DIFFUSE = 8,
- PATH_RAY_GLOSSY = 16,
- PATH_RAY_SINGULAR = 32,
- PATH_RAY_TRANSPARENT = 64,
-
- PATH_RAY_SHADOW_OPAQUE = 128,
- PATH_RAY_SHADOW_TRANSPARENT = 256,
+ PATH_RAY_CAMERA = (1 << 0),
+ PATH_RAY_REFLECT = (1 << 1),
+ PATH_RAY_TRANSMIT = (1 << 2),
+ PATH_RAY_DIFFUSE = (1 << 3),
+ PATH_RAY_GLOSSY = (1 << 4),
+ PATH_RAY_SINGULAR = (1 << 5),
+ PATH_RAY_TRANSPARENT = (1 << 6),
+
+ PATH_RAY_SHADOW_OPAQUE = (1 << 7),
+ PATH_RAY_SHADOW_TRANSPARENT = (1 << 8),
PATH_RAY_SHADOW = (PATH_RAY_SHADOW_OPAQUE|PATH_RAY_SHADOW_TRANSPARENT),
- PATH_RAY_CURVE = 512, /* visibility flag to define curve segments */
- PATH_RAY_VOLUME_SCATTER = 1024, /* volume scattering */
+ PATH_RAY_CURVE = (1 << 9), /* visibility flag to define curve segments */
+ PATH_RAY_VOLUME_SCATTER = (1 << 10), /* volume scattering */
/* Special flag to tag unaligned BVH nodes. */
- PATH_RAY_NODE_UNALIGNED = 2048,
+ PATH_RAY_NODE_UNALIGNED = (1 << 11),
- PATH_RAY_ALL_VISIBILITY = (1|2|4|8|16|32|64|128|256|512|1024|2048),
+ PATH_RAY_ALL_VISIBILITY = ((1 << 12)-1),
- PATH_RAY_MIS_SKIP = 4096,
- PATH_RAY_DIFFUSE_ANCESTOR = 8192,
- PATH_RAY_SINGLE_PASS_DONE = 16384,
- PATH_RAY_SHADOW_CATCHER = 32768,
- PATH_RAY_SHADOW_CATCHER_ONLY = 65536,
+ PATH_RAY_MIS_SKIP = (1 << 12),
+ PATH_RAY_DIFFUSE_ANCESTOR = (1 << 13),
+ PATH_RAY_SINGLE_PASS_DONE = (1 << 14),
+ PATH_RAY_SHADOW_CATCHER = (1 << 15),
+ PATH_RAY_SHADOW_CATCHER_ONLY = (1 << 16),
+ PATH_RAY_STORE_SHADOW_INFO = (1 << 17),
};
/* Closure Label */
@@ -383,6 +403,22 @@ typedef enum PassType {
#define PASS_ALL (~0)
+typedef enum DenoisingPassOffsets {
+ DENOISING_PASS_NORMAL = 0,
+ DENOISING_PASS_NORMAL_VAR = 3,
+ DENOISING_PASS_ALBEDO = 6,
+ DENOISING_PASS_ALBEDO_VAR = 9,
+ DENOISING_PASS_DEPTH = 12,
+ DENOISING_PASS_DEPTH_VAR = 13,
+ DENOISING_PASS_SHADOW_A = 14,
+ DENOISING_PASS_SHADOW_B = 17,
+ DENOISING_PASS_COLOR = 20,
+ DENOISING_PASS_COLOR_VAR = 23,
+
+ DENOISING_PASS_SIZE_BASE = 26,
+ DENOISING_PASS_SIZE_CLEAN = 3,
+} DenoisingPassOffsets;
+
typedef enum BakePassFilter {
BAKE_FILTER_NONE = 0,
BAKE_FILTER_DIRECT = (1 << 0),
@@ -416,6 +452,18 @@ typedef enum BakePassFilterCombos {
BAKE_FILTER_SUBSURFACE_INDIRECT = (BAKE_FILTER_INDIRECT | BAKE_FILTER_SUBSURFACE),
} BakePassFilterCombos;
+typedef enum DenoiseFlag {
+ DENOISING_CLEAN_DIFFUSE_DIR = (1 << 0),
+ DENOISING_CLEAN_DIFFUSE_IND = (1 << 1),
+ DENOISING_CLEAN_GLOSSY_DIR = (1 << 2),
+ DENOISING_CLEAN_GLOSSY_IND = (1 << 3),
+ DENOISING_CLEAN_TRANSMISSION_DIR = (1 << 4),
+ DENOISING_CLEAN_TRANSMISSION_IND = (1 << 5),
+ DENOISING_CLEAN_SUBSURFACE_DIR = (1 << 6),
+ DENOISING_CLEAN_SUBSURFACE_IND = (1 << 7),
+ DENOISING_CLEAN_ALL_PASSES = (1 << 8)-1,
+} DenoiseFlag;
+
typedef ccl_addr_space struct PathRadiance {
#ifdef __PASSES__
int use_light_pass;
@@ -469,8 +517,20 @@ typedef ccl_addr_space struct PathRadiance {
float3 path_total_shaded;
/* Color of the background on which shadow is alpha-overed. */
- float3 shadow_color;
+ float3 shadow_background_color;
+
+ /* Path radiance sum and throughput at the moment when ray hits shadow
+ * catcher object.
+ */
+ float3 shadow_radiance_sum;
+ float shadow_throughput;
#endif
+
+#ifdef __DENOISING_FEATURES__
+ float3 denoising_normal;
+ float3 denoising_albedo;
+ float denoising_depth;
+#endif /* __DENOISING_FEATURES__ */
} PathRadiance;
typedef struct BsdfEval {
@@ -713,12 +773,13 @@ typedef struct AttributeDescriptor {
#define SHADER_CLOSURE_BASE \
float3 weight; \
ClosureType type; \
- float sample_weight \
+ float sample_weight; \
+ float3 N
typedef ccl_addr_space struct ccl_align(16) ShaderClosure {
SHADER_CLOSURE_BASE;
- float data[14]; /* pad to 80 bytes */
+ float data[10]; /* pad to 80 bytes */
} ShaderClosure;
/* Shader Context
@@ -949,6 +1010,10 @@ typedef struct PathState {
int transmission_bounce;
int transparent_bounce;
+#ifdef __DENOISING_FEATURES__
+ float denoising_feature_weight;
+#endif /* __DENOISING_FEATURES__ */
+
/* multiple importance sampling */
float min_ray_pdf; /* smallest bounce pdf over entire path up to now */
float ray_pdf; /* last bounce pdf */
@@ -1126,6 +1191,11 @@ typedef struct KernelFilm {
float mist_inv_depth;
float mist_falloff;
+ int pass_denoising_data;
+ int pass_denoising_clean;
+ int denoising_flags;
+ int pad;
+
#ifdef __KERNEL_DEBUG__
int pass_bvh_traversed_nodes;
int pass_bvh_traversed_instances;
@@ -1298,7 +1368,6 @@ typedef ccl_addr_space struct DebugData {
* Queue 3 - Shadow ray cast kernel - AO
* Queeu 4 - Shadow ray cast kernel - direct lighting
*/
-#define NUM_QUEUES 4
/* Queue names */
enum QueueNumber {
@@ -1311,22 +1380,42 @@ enum QueueNumber {
* 3. Rays to be regenerated
* are enqueued here.
*/
- QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS = 1,
+ QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
/* All rays for which a shadow ray should be cast to determine radiance
* contribution for AO are enqueued here.
*/
- QUEUE_SHADOW_RAY_CAST_AO_RAYS = 2,
+ QUEUE_SHADOW_RAY_CAST_AO_RAYS,
/* All rays for which a shadow ray should be cast to determine radiance
* contributing for direct lighting are enqueued here.
*/
- QUEUE_SHADOW_RAY_CAST_DL_RAYS = 3,
+ QUEUE_SHADOW_RAY_CAST_DL_RAYS,
+
+ /* Rays sorted according to shader->id */
+ QUEUE_SHADER_SORTED_RAYS,
+
+#ifdef __BRANCHED_PATH__
+ /* All rays moving to next iteration of the indirect loop for light */
+ QUEUE_LIGHT_INDIRECT_ITER,
+ /* Queue of all inactive rays. These are candidates for sharing work of indirect loops */
+ QUEUE_INACTIVE_RAYS,
+# ifdef __VOLUME__
+ /* All rays moving to next iteration of the indirect loop for volumes */
+ QUEUE_VOLUME_INDIRECT_ITER,
+# endif
+# ifdef __SUBSURFACE__
+ /* All rays moving to next iteration of the indirect loop for subsurface */
+ QUEUE_SUBSURFACE_INDIRECT_ITER,
+# endif
+#endif /* __BRANCHED_PATH__ */
+
+ NUM_QUEUES
};
-/* We use RAY_STATE_MASK to get ray_state (enums 0 to 5) */
-#define RAY_STATE_MASK 0x007
-#define RAY_FLAG_MASK 0x0F8
+/* We use RAY_STATE_MASK to get ray_state */
+#define RAY_STATE_MASK 0x0F
+#define RAY_FLAG_MASK 0xF0
enum RayState {
RAY_INVALID = 0,
/* Denotes ray is actively involved in path-iteration. */
@@ -1341,14 +1430,25 @@ enum RayState {
RAY_TO_REGENERATE,
/* Denotes ray has been regenerated */
RAY_REGENERATED,
- /* Flag's ray has to execute shadow blocked function in AO part */
- RAY_SHADOW_RAY_CAST_AO = 16,
- /* Flag's ray has to execute shadow blocked function in direct lighting part. */
- RAY_SHADOW_RAY_CAST_DL = 32,
+ /* Denotes ray is moving to next iteration of the branched indirect loop */
+ RAY_LIGHT_INDIRECT_NEXT_ITER,
+ RAY_VOLUME_INDIRECT_NEXT_ITER,
+ RAY_SUBSURFACE_INDIRECT_NEXT_ITER,
+
+ /* Ray flags */
+
+ /* Flags to denote that the ray is currently evaluating the branched indirect loop */
+ RAY_BRANCHED_LIGHT_INDIRECT = (1 << 4),
+ RAY_BRANCHED_VOLUME_INDIRECT = (1 << 5),
+ RAY_BRANCHED_SUBSURFACE_INDIRECT = (1 << 6),
+ RAY_BRANCHED_INDIRECT = (RAY_BRANCHED_LIGHT_INDIRECT | RAY_BRANCHED_VOLUME_INDIRECT | RAY_BRANCHED_SUBSURFACE_INDIRECT),
+
+ /* Ray is evaluating an iteration of an indirect loop for another thread */
+ RAY_BRANCHED_INDIRECT_SHARED = (1 << 7),
};
#define ASSIGN_RAY_STATE(ray_state, ray_index, state) (ray_state[ray_index] = ((ray_state[ray_index] & RAY_FLAG_MASK) | state))
-#define IS_STATE(ray_state, ray_index, state) ((ray_state[ray_index] & RAY_STATE_MASK) == state)
+#define IS_STATE(ray_state, ray_index, state) ((ray_index) != QUEUE_EMPTY_SLOT && ((ray_state)[(ray_index)] & RAY_STATE_MASK) == (state))
#define ADD_RAY_FLAG(ray_state, ray_index, flag) (ray_state[ray_index] = (ray_state[ray_index] | flag))
#define REMOVE_RAY_FLAG(ray_state, ray_index, flag) (ray_state[ray_index] = (ray_state[ray_index] & (~flag)))
#define IS_FLAG(ray_state, ray_index, flag) (ray_state[ray_index] & flag)
diff --git a/intern/cycles/kernel/kernel_volume.h b/intern/cycles/kernel/kernel_volume.h
index 9c0878249d4..1e472aaf51a 100644
--- a/intern/cycles/kernel/kernel_volume.h
+++ b/intern/cycles/kernel/kernel_volume.h
@@ -660,6 +660,7 @@ typedef struct VolumeSegment {
* but the entire segment is needed to do always scattering, rather than probabilistically
* hitting or missing the volume. if we don't know the transmittance at the end of the
* volume we can't generate stratified distance samples up to that transmittance */
+#ifdef __VOLUME_DECOUPLED__
ccl_device void kernel_volume_decoupled_record(KernelGlobals *kg, PathState *state,
Ray *ray, ShaderData *sd, VolumeSegment *segment, bool heterogeneous)
{
@@ -829,6 +830,7 @@ ccl_device void kernel_volume_decoupled_free(KernelGlobals *kg, VolumeSegment *s
#endif
}
}
+#endif /* __VOLUME_DECOUPLED__ */
/* scattering for homogeneous and heterogeneous volumes, using decoupled ray
* marching.
diff --git a/intern/cycles/kernel/kernels/cpu/filter.cpp b/intern/cycles/kernel/kernels/cpu/filter.cpp
new file mode 100644
index 00000000000..2ff1a392dc3
--- /dev/null
+++ b/intern/cycles/kernel/kernels/cpu/filter.cpp
@@ -0,0 +1,61 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* CPU kernel entry points */
+
+/* On x86-64, we can assume SSE2, so avoid the extra kernel and compile this
+ * one with SSE2 intrinsics.
+ */
+#if defined(__x86_64__) || defined(_M_X64)
+# define __KERNEL_SSE2__
+#endif
+
+/* When building kernel for native machine detect kernel features from the flags
+ * set by compiler.
+ */
+#ifdef WITH_KERNEL_NATIVE
+# ifdef __SSE2__
+# ifndef __KERNEL_SSE2__
+# define __KERNEL_SSE2__
+# endif
+# endif
+# ifdef __SSE3__
+# define __KERNEL_SSE3__
+# endif
+# ifdef __SSSE3__
+# define __KERNEL_SSSE3__
+# endif
+# ifdef __SSE4_1__
+# define __KERNEL_SSE41__
+# endif
+# ifdef __AVX__
+# define __KERNEL_SSE__
+# define __KERNEL_AVX__
+# endif
+# ifdef __AVX2__
+# define __KERNEL_SSE__
+# define __KERNEL_AVX2__
+# endif
+#endif
+
+/* quiet unused define warnings */
+#if defined(__KERNEL_SSE2__)
+ /* do nothing */
+#endif
+
+#include "kernel/filter/filter.h"
+#define KERNEL_ARCH cpu
+#include "kernel/kernels/cpu/filter_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/filter_avx.cpp b/intern/cycles/kernel/kernels/cpu/filter_avx.cpp
new file mode 100644
index 00000000000..4a9e6047ecf
--- /dev/null
+++ b/intern/cycles/kernel/kernels/cpu/filter_avx.cpp
@@ -0,0 +1,39 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Optimized CPU kernel entry points. This file is compiled with AVX
+ * optimization flags and nearly all functions inlined, while kernel.cpp
+ * is compiled without for other CPU's. */
+
+#include "util/util_optimization.h"
+
+#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
+# define KERNEL_STUB
+#else
+/* SSE optimization disabled for now on 32 bit, see bug #36316 */
+# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+# define __KERNEL_SSE__
+# define __KERNEL_SSE2__
+# define __KERNEL_SSE3__
+# define __KERNEL_SSSE3__
+# define __KERNEL_SSE41__
+# define __KERNEL_AVX__
+# endif
+#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX */
+
+#include "kernel/filter/filter.h"
+#define KERNEL_ARCH cpu_avx
+#include "kernel/kernels/cpu/filter_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/filter_avx2.cpp b/intern/cycles/kernel/kernels/cpu/filter_avx2.cpp
new file mode 100644
index 00000000000..c22ec576254
--- /dev/null
+++ b/intern/cycles/kernel/kernels/cpu/filter_avx2.cpp
@@ -0,0 +1,40 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Optimized CPU kernel entry points. This file is compiled with AVX2
+ * optimization flags and nearly all functions inlined, while kernel.cpp
+ * is compiled without for other CPU's. */
+
+#include "util/util_optimization.h"
+
+#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
+# define KERNEL_STUB
+#else
+/* SSE optimization disabled for now on 32 bit, see bug #36316 */
+# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+# define __KERNEL_SSE__
+# define __KERNEL_SSE2__
+# define __KERNEL_SSE3__
+# define __KERNEL_SSSE3__
+# define __KERNEL_SSE41__
+# define __KERNEL_AVX__
+# define __KERNEL_AVX2__
+# endif
+#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 */
+
+#include "kernel/filter/filter.h"
+#define KERNEL_ARCH cpu_avx2
+#include "kernel/kernels/cpu/filter_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/filter_cpu.h b/intern/cycles/kernel/kernels/cpu/filter_cpu.h
new file mode 100644
index 00000000000..2ed713299fd
--- /dev/null
+++ b/intern/cycles/kernel/kernels/cpu/filter_cpu.h
@@ -0,0 +1,138 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Templated common declaration part of all CPU kernels. */
+
+void KERNEL_FUNCTION_FULL_NAME(filter_divide_shadow)(int sample,
+ TilesInfo *tiles,
+ int x,
+ int y,
+ float *unfilteredA,
+ float *unfilteredB,
+ float *sampleV,
+ float *sampleVV,
+ float *bufferV,
+ int* prefilter_rect,
+ int buffer_pass_stride,
+ int buffer_denoising_offset,
+ bool use_split_variance);
+
+void KERNEL_FUNCTION_FULL_NAME(filter_get_feature)(int sample,
+ TilesInfo *tiles,
+ int m_offset,
+ int v_offset,
+ int x,
+ int y,
+ float *mean,
+ float *variance,
+ int* prefilter_rect,
+ int buffer_pass_stride,
+ int buffer_denoising_offset,
+ bool use_split_variance);
+
+void KERNEL_FUNCTION_FULL_NAME(filter_detect_outliers)(int x, int y,
+ ccl_global float *image,
+ ccl_global float *variance,
+ ccl_global float *depth,
+ ccl_global float *output,
+ int *rect,
+ int pass_stride);
+
+void KERNEL_FUNCTION_FULL_NAME(filter_combine_halves)(int x, int y,
+ float *mean,
+ float *variance,
+ float *a,
+ float *b,
+ int* prefilter_rect,
+ int r);
+
+void KERNEL_FUNCTION_FULL_NAME(filter_construct_transform)(float* buffer,
+ int x,
+ int y,
+ int storage_ofs,
+ float *transform,
+ int *rank,
+ int* rect,
+ int pass_stride,
+ int radius,
+ float pca_threshold);
+
+void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_difference)(int dx,
+ int dy,
+ float *weight_image,
+ float *variance,
+ float *difference_image,
+ int* rect,
+ int w,
+ int channel_offset,
+ float a,
+ float k_2);
+
+void KERNEL_FUNCTION_FULL_NAME(filter_nlm_blur)(float *difference_image,
+ float *out_image,
+ int* rect,
+ int w,
+ int f);
+
+void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_weight)(float *difference_image,
+ float *out_image,
+ int* rect,
+ int w,
+ int f);
+
+void KERNEL_FUNCTION_FULL_NAME(filter_nlm_update_output)(int dx,
+ int dy,
+ float *difference_image,
+ float *image,
+ float *out_image,
+ float *accum_image,
+ int* rect,
+ int w,
+ int f);
+
+void KERNEL_FUNCTION_FULL_NAME(filter_nlm_construct_gramian)(int dx,
+ int dy,
+ float *difference_image,
+ float *buffer,
+ float *transform,
+ int *rank,
+ float *XtWX,
+ float3 *XtWY,
+ int *rect,
+ int *filter_rect,
+ int w,
+ int h,
+ int f,
+ int pass_stride);
+
+void KERNEL_FUNCTION_FULL_NAME(filter_nlm_normalize)(float *out_image,
+ float *accum_image,
+ int* rect,
+ int w);
+
+void KERNEL_FUNCTION_FULL_NAME(filter_finalize)(int x,
+ int y,
+ int storage_ofs,
+ int w,
+ int h,
+ float *buffer,
+ int *rank,
+ float *XtWX,
+ float3 *XtWY,
+ int *buffer_params,
+ int sample);
+
+#undef KERNEL_ARCH
diff --git a/intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h b/intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h
new file mode 100644
index 00000000000..8dc1a8d583c
--- /dev/null
+++ b/intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h
@@ -0,0 +1,272 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Templated common implementation part of all CPU kernels.
+ *
+ * The idea is that particular .cpp files sets needed optimization flags and
+ * simply includes this file without worry of copying actual implementation over.
+ */
+
+#include "kernel/kernel_compat_cpu.h"
+
+#include "kernel/filter/filter_kernel.h"
+
+#ifdef KERNEL_STUB
+# include "util/util_debug.h"
+# define STUB_ASSERT(arch, name) assert(!(#name " kernel stub for architecture " #arch " was called!"))
+#endif
+
+CCL_NAMESPACE_BEGIN
+
+
+/* Denoise filter */
+
+void KERNEL_FUNCTION_FULL_NAME(filter_divide_shadow)(int sample,
+ TilesInfo *tiles,
+ int x,
+ int y,
+ float *unfilteredA,
+ float *unfilteredB,
+ float *sampleVariance,
+ float *sampleVarianceV,
+ float *bufferVariance,
+ int* prefilter_rect,
+ int buffer_pass_stride,
+ int buffer_denoising_offset,
+ bool use_split_variance)
+{
+#ifdef KERNEL_STUB
+ STUB_ASSERT(KERNEL_ARCH, filter_divide_shadow);
+#else
+ kernel_filter_divide_shadow(sample, tiles,
+ x, y,
+ unfilteredA,
+ unfilteredB,
+ sampleVariance,
+ sampleVarianceV,
+ bufferVariance,
+ load_int4(prefilter_rect),
+ buffer_pass_stride,
+ buffer_denoising_offset,
+ use_split_variance);
+#endif
+}
+
+void KERNEL_FUNCTION_FULL_NAME(filter_get_feature)(int sample,
+ TilesInfo *tiles,
+ int m_offset,
+ int v_offset,
+ int x,
+ int y,
+ float *mean, float *variance,
+ int* prefilter_rect,
+ int buffer_pass_stride,
+ int buffer_denoising_offset,
+ bool use_split_variance)
+{
+#ifdef KERNEL_STUB
+ STUB_ASSERT(KERNEL_ARCH, filter_get_feature);
+#else
+ kernel_filter_get_feature(sample, tiles,
+ m_offset, v_offset,
+ x, y,
+ mean, variance,
+ load_int4(prefilter_rect),
+ buffer_pass_stride,
+ buffer_denoising_offset,
+ use_split_variance);
+#endif
+}
+
+void KERNEL_FUNCTION_FULL_NAME(filter_detect_outliers)(int x, int y,
+ ccl_global float *image,
+ ccl_global float *variance,
+ ccl_global float *depth,
+ ccl_global float *output,
+ int *rect,
+ int pass_stride)
+{
+#ifdef KERNEL_STUB
+ STUB_ASSERT(KERNEL_ARCH, filter_detect_outliers);
+#else
+ kernel_filter_detect_outliers(x, y, image, variance, depth, output, load_int4(rect), pass_stride);
+#endif
+}
+
+void KERNEL_FUNCTION_FULL_NAME(filter_combine_halves)(int x, int y,
+ float *mean,
+ float *variance,
+ float *a,
+ float *b,
+ int* prefilter_rect,
+ int r)
+{
+#ifdef KERNEL_STUB
+ STUB_ASSERT(KERNEL_ARCH, filter_combine_halves);
+#else
+ kernel_filter_combine_halves(x, y, mean, variance, a, b, load_int4(prefilter_rect), r);
+#endif
+}
+
+void KERNEL_FUNCTION_FULL_NAME(filter_construct_transform)(float* buffer,
+ int x,
+ int y,
+ int storage_ofs,
+ float *transform,
+ int *rank,
+ int* prefilter_rect,
+ int pass_stride,
+ int radius,
+ float pca_threshold)
+{
+#ifdef KERNEL_STUB
+ STUB_ASSERT(KERNEL_ARCH, filter_construct_transform);
+#else
+ rank += storage_ofs;
+ transform += storage_ofs*TRANSFORM_SIZE;
+ kernel_filter_construct_transform(buffer,
+ x, y,
+ load_int4(prefilter_rect),
+ pass_stride,
+ transform,
+ rank,
+ radius,
+ pca_threshold);
+#endif
+}
+
+void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_difference)(int dx,
+ int dy,
+ float *weight_image,
+ float *variance,
+ float *difference_image,
+ int *rect,
+ int w,
+ int channel_offset,
+ float a,
+ float k_2)
+{
+#ifdef KERNEL_STUB
+ STUB_ASSERT(KERNEL_ARCH, filter_nlm_calc_difference);
+#else
+ kernel_filter_nlm_calc_difference(dx, dy, weight_image, variance, difference_image, load_int4(rect), w, channel_offset, a, k_2);
+#endif
+}
+
+void KERNEL_FUNCTION_FULL_NAME(filter_nlm_blur)(float *difference_image,
+ float *out_image,
+ int *rect,
+ int w,
+ int f)
+{
+#ifdef KERNEL_STUB
+ STUB_ASSERT(KERNEL_ARCH, filter_nlm_blur);
+#else
+ kernel_filter_nlm_blur(difference_image, out_image, load_int4(rect), w, f);
+#endif
+}
+
+void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_weight)(float *difference_image,
+ float *out_image,
+ int *rect,
+ int w,
+ int f)
+{
+#ifdef KERNEL_STUB
+ STUB_ASSERT(KERNEL_ARCH, filter_nlm_calc_weight);
+#else
+ kernel_filter_nlm_calc_weight(difference_image, out_image, load_int4(rect), w, f);
+#endif
+}
+
+void KERNEL_FUNCTION_FULL_NAME(filter_nlm_update_output)(int dx,
+ int dy,
+ float *difference_image,
+ float *image,
+ float *out_image,
+ float *accum_image,
+ int *rect,
+ int w,
+ int f)
+{
+#ifdef KERNEL_STUB
+ STUB_ASSERT(KERNEL_ARCH, filter_nlm_update_output);
+#else
+ kernel_filter_nlm_update_output(dx, dy, difference_image, image, out_image, accum_image, load_int4(rect), w, f);
+#endif
+}
+
+void KERNEL_FUNCTION_FULL_NAME(filter_nlm_construct_gramian)(int dx,
+ int dy,
+ float *difference_image,
+ float *buffer,
+ float *transform,
+ int *rank,
+ float *XtWX,
+ float3 *XtWY,
+ int *rect,
+ int *filter_rect,
+ int w,
+ int h,
+ int f,
+ int pass_stride)
+{
+#ifdef KERNEL_STUB
+ STUB_ASSERT(KERNEL_ARCH, filter_nlm_construct_gramian);
+#else
+ kernel_filter_nlm_construct_gramian(dx, dy, difference_image, buffer, transform, rank, XtWX, XtWY, load_int4(rect), load_int4(filter_rect), w, h, f, pass_stride);
+#endif
+}
+
+void KERNEL_FUNCTION_FULL_NAME(filter_nlm_normalize)(float *out_image,
+ float *accum_image,
+ int *rect,
+ int w)
+{
+#ifdef KERNEL_STUB
+ STUB_ASSERT(KERNEL_ARCH, filter_nlm_normalize);
+#else
+ kernel_filter_nlm_normalize(out_image, accum_image, load_int4(rect), w);
+#endif
+}
+
+void KERNEL_FUNCTION_FULL_NAME(filter_finalize)(int x,
+ int y,
+ int storage_ofs,
+ int w,
+ int h,
+ float *buffer,
+ int *rank,
+ float *XtWX,
+ float3 *XtWY,
+ int *buffer_params,
+ int sample)
+{
+#ifdef KERNEL_STUB
+ STUB_ASSERT(KERNEL_ARCH, filter_finalize);
+#else
+ XtWX += storage_ofs*XTWX_SIZE;
+ XtWY += storage_ofs*XTWY_SIZE;
+ rank += storage_ofs;
+ kernel_filter_finalize(x, y, w, h, buffer, rank, 1, XtWX, XtWY, load_int4(buffer_params), sample);
+#endif
+}
+
+#undef KERNEL_STUB
+#undef STUB_ASSERT
+#undef KERNEL_ARCH
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernels/cpu/filter_sse2.cpp b/intern/cycles/kernel/kernels/cpu/filter_sse2.cpp
new file mode 100644
index 00000000000..f7c9935f1d0
--- /dev/null
+++ b/intern/cycles/kernel/kernels/cpu/filter_sse2.cpp
@@ -0,0 +1,34 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Optimized CPU kernel entry points. This file is compiled with SSE2
+ * optimization flags and nearly all functions inlined, while kernel.cpp
+ * is compiled without for other CPU's. */
+
+#include "util/util_optimization.h"
+
+#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
+# define KERNEL_STUB
+#else
+/* SSE optimization disabled for now on 32 bit, see bug #36316 */
+# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+# define __KERNEL_SSE2__
+# endif
+#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 */
+
+#include "kernel/filter/filter.h"
+#define KERNEL_ARCH cpu_sse2
+#include "kernel/kernels/cpu/filter_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/filter_sse3.cpp b/intern/cycles/kernel/kernels/cpu/filter_sse3.cpp
new file mode 100644
index 00000000000..070b95a3505
--- /dev/null
+++ b/intern/cycles/kernel/kernels/cpu/filter_sse3.cpp
@@ -0,0 +1,36 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Optimized CPU kernel entry points. This file is compiled with SSE3/SSSE3
+ * optimization flags and nearly all functions inlined, while kernel.cpp
+ * is compiled without for other CPU's. */
+
+#include "util/util_optimization.h"
+
+#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
+# define KERNEL_STUB
+#else
+/* SSE optimization disabled for now on 32 bit, see bug #36316 */
+# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+# define __KERNEL_SSE2__
+# define __KERNEL_SSE3__
+# define __KERNEL_SSSE3__
+# endif
+#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 */
+
+#include "kernel/filter/filter.h"
+#define KERNEL_ARCH cpu_sse3
+#include "kernel/kernels/cpu/filter_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/filter_sse41.cpp b/intern/cycles/kernel/kernels/cpu/filter_sse41.cpp
new file mode 100644
index 00000000000..1a7b2040da1
--- /dev/null
+++ b/intern/cycles/kernel/kernels/cpu/filter_sse41.cpp
@@ -0,0 +1,37 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Optimized CPU kernel entry points. This file is compiled with SSE3/SSSE3
+ * optimization flags and nearly all functions inlined, while kernel.cpp
+ * is compiled without for other CPU's. */
+
+#include "util/util_optimization.h"
+
+#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
+# define KERNEL_STUB
+#else
+/* SSE optimization disabled for now on 32 bit, see bug #36316 */
+# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+# define __KERNEL_SSE2__
+# define __KERNEL_SSE3__
+# define __KERNEL_SSSE3__
+# define __KERNEL_SSE41__
+# endif
+#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 */
+
+#include "kernel/filter/filter.h"
+#define KERNEL_ARCH cpu_sse41
+#include "kernel/kernels/cpu/filter_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/kernel.cpp b/intern/cycles/kernel/kernels/cpu/kernel.cpp
index 16992c681e6..998619ac897 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel.cpp
+++ b/intern/cycles/kernel/kernels/cpu/kernel.cpp
@@ -95,9 +95,12 @@ void kernel_tex_copy(KernelGlobals *kg,
else if(strstr(name, "__tex_image_float4")) {
texture_image_float4 *tex = NULL;
int id = atoi(name + strlen("__tex_image_float4_"));
- int array_index = id;
+ int array_index = kernel_tex_index(id);
- if(array_index >= 0 && array_index < TEX_NUM_FLOAT4_CPU) {
+ if(array_index >= 0) {
+ if(array_index >= kg->texture_float4_images.size()) {
+ kg->texture_float4_images.resize(array_index+1);
+ }
tex = &kg->texture_float4_images[array_index];
}
@@ -111,9 +114,12 @@ void kernel_tex_copy(KernelGlobals *kg,
else if(strstr(name, "__tex_image_float")) {
texture_image_float *tex = NULL;
int id = atoi(name + strlen("__tex_image_float_"));
- int array_index = id - TEX_START_FLOAT_CPU;
+ int array_index = kernel_tex_index(id);
- if(array_index >= 0 && array_index < TEX_NUM_FLOAT_CPU) {
+ if(array_index >= 0) {
+ if(array_index >= kg->texture_float_images.size()) {
+ kg->texture_float_images.resize(array_index+1);
+ }
tex = &kg->texture_float_images[array_index];
}
@@ -127,9 +133,12 @@ void kernel_tex_copy(KernelGlobals *kg,
else if(strstr(name, "__tex_image_byte4")) {
texture_image_uchar4 *tex = NULL;
int id = atoi(name + strlen("__tex_image_byte4_"));
- int array_index = id - TEX_START_BYTE4_CPU;
+ int array_index = kernel_tex_index(id);
- if(array_index >= 0 && array_index < TEX_NUM_BYTE4_CPU) {
+ if(array_index >= 0) {
+ if(array_index >= kg->texture_byte4_images.size()) {
+ kg->texture_byte4_images.resize(array_index+1);
+ }
tex = &kg->texture_byte4_images[array_index];
}
@@ -143,9 +152,12 @@ void kernel_tex_copy(KernelGlobals *kg,
else if(strstr(name, "__tex_image_byte")) {
texture_image_uchar *tex = NULL;
int id = atoi(name + strlen("__tex_image_byte_"));
- int array_index = id - TEX_START_BYTE_CPU;
+ int array_index = kernel_tex_index(id);
- if(array_index >= 0 && array_index < TEX_NUM_BYTE_CPU) {
+ if(array_index >= 0) {
+ if(array_index >= kg->texture_byte_images.size()) {
+ kg->texture_byte_images.resize(array_index+1);
+ }
tex = &kg->texture_byte_images[array_index];
}
@@ -159,9 +171,12 @@ void kernel_tex_copy(KernelGlobals *kg,
else if(strstr(name, "__tex_image_half4")) {
texture_image_half4 *tex = NULL;
int id = atoi(name + strlen("__tex_image_half4_"));
- int array_index = id - TEX_START_HALF4_CPU;
+ int array_index = kernel_tex_index(id);
- if(array_index >= 0 && array_index < TEX_NUM_HALF4_CPU) {
+ if(array_index >= 0) {
+ if(array_index >= kg->texture_half4_images.size()) {
+ kg->texture_half4_images.resize(array_index+1);
+ }
tex = &kg->texture_half4_images[array_index];
}
@@ -175,9 +190,12 @@ void kernel_tex_copy(KernelGlobals *kg,
else if(strstr(name, "__tex_image_half")) {
texture_image_half *tex = NULL;
int id = atoi(name + strlen("__tex_image_half_"));
- int array_index = id - TEX_START_HALF_CPU;
+ int array_index = kernel_tex_index(id);
- if(array_index >= 0 && array_index < TEX_NUM_HALF_CPU) {
+ if(array_index >= 0) {
+ if(array_index >= kg->texture_half_images.size()) {
+ kg->texture_half_images.resize(array_index+1);
+ }
tex = &kg->texture_half_images[array_index];
}
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_avx.cpp b/intern/cycles/kernel/kernels/cpu/kernel_avx.cpp
index 2600d977972..a645fb4d8dd 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_avx.cpp
+++ b/intern/cycles/kernel/kernels/cpu/kernel_avx.cpp
@@ -17,21 +17,23 @@
/* Optimized CPU kernel entry points. This file is compiled with AVX
* optimization flags and nearly all functions inlined, while kernel.cpp
* is compiled without for other CPU's. */
-
-/* SSE optimization disabled for now on 32 bit, see bug #36316 */
-#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
-# define __KERNEL_SSE__
-# define __KERNEL_SSE2__
-# define __KERNEL_SSE3__
-# define __KERNEL_SSSE3__
-# define __KERNEL_SSE41__
-# define __KERNEL_AVX__
-#endif
#include "util/util_optimization.h"
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
-# include "kernel/kernel.h"
-# define KERNEL_ARCH cpu_avx
-# include "kernel/kernels/cpu/kernel_cpu_impl.h"
+#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
+# define KERNEL_STUB
+#else
+/* SSE optimization disabled for now on 32 bit, see bug #36316 */
+# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+# define __KERNEL_SSE__
+# define __KERNEL_SSE2__
+# define __KERNEL_SSE3__
+# define __KERNEL_SSSE3__
+# define __KERNEL_SSE41__
+# define __KERNEL_AVX__
+# endif
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX */
+
+#include "kernel/kernel.h"
+#define KERNEL_ARCH cpu_avx
+#include "kernel/kernels/cpu/kernel_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_avx2.cpp b/intern/cycles/kernel/kernels/cpu/kernel_avx2.cpp
index dba15d037ac..6bbb87727b9 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_avx2.cpp
+++ b/intern/cycles/kernel/kernels/cpu/kernel_avx2.cpp
@@ -18,21 +18,23 @@
* optimization flags and nearly all functions inlined, while kernel.cpp
* is compiled without for other CPU's. */
-/* SSE optimization disabled for now on 32 bit, see bug #36316 */
-#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
-# define __KERNEL_SSE__
-# define __KERNEL_SSE2__
-# define __KERNEL_SSE3__
-# define __KERNEL_SSSE3__
-# define __KERNEL_SSE41__
-# define __KERNEL_AVX__
-# define __KERNEL_AVX2__
-#endif
-
#include "util/util_optimization.h"
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
-# include "kernel/kernel.h"
-# define KERNEL_ARCH cpu_avx2
-# include "kernel/kernels/cpu/kernel_cpu_impl.h"
+#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
+# define KERNEL_STUB
+#else
+/* SSE optimization disabled for now on 32 bit, see bug #36316 */
+# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+# define __KERNEL_SSE__
+# define __KERNEL_SSE2__
+# define __KERNEL_SSE3__
+# define __KERNEL_SSSE3__
+# define __KERNEL_SSE41__
+# define __KERNEL_AVX__
+# define __KERNEL_AVX2__
+# endif
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 */
+
+#include "kernel/kernel.h"
+#define KERNEL_ARCH cpu_avx2
+#include "kernel/kernels/cpu/kernel_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_cpu.h b/intern/cycles/kernel/kernels/cpu/kernel_cpu.h
index 896b80d783e..c8938534fe8 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_cpu.h
+++ b/intern/cycles/kernel/kernels/cpu/kernel_cpu.h
@@ -77,16 +77,17 @@ DECLARE_SPLIT_KERNEL_FUNCTION(lamp_emission)
DECLARE_SPLIT_KERNEL_FUNCTION(do_volume)
DECLARE_SPLIT_KERNEL_FUNCTION(queue_enqueue)
DECLARE_SPLIT_KERNEL_FUNCTION(indirect_background)
+DECLARE_SPLIT_KERNEL_FUNCTION(shader_setup)
+DECLARE_SPLIT_KERNEL_FUNCTION(shader_sort)
DECLARE_SPLIT_KERNEL_FUNCTION(shader_eval)
DECLARE_SPLIT_KERNEL_FUNCTION(holdout_emission_blurring_pathtermination_ao)
DECLARE_SPLIT_KERNEL_FUNCTION(subsurface_scatter)
DECLARE_SPLIT_KERNEL_FUNCTION(direct_lighting)
DECLARE_SPLIT_KERNEL_FUNCTION(shadow_blocked_ao)
DECLARE_SPLIT_KERNEL_FUNCTION(shadow_blocked_dl)
+DECLARE_SPLIT_KERNEL_FUNCTION(enqueue_inactive)
DECLARE_SPLIT_KERNEL_FUNCTION(next_iteration_setup)
DECLARE_SPLIT_KERNEL_FUNCTION(indirect_subsurface)
DECLARE_SPLIT_KERNEL_FUNCTION(buffer_update)
-void KERNEL_FUNCTION_FULL_NAME(register_functions)(void(*reg)(const char* name, void* func));
-
#undef KERNEL_ARCH
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_cpu_image.h b/intern/cycles/kernel/kernels/cpu/kernel_cpu_image.h
index af68907a5c2..f6bb4c25012 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_cpu_image.h
+++ b/intern/cycles/kernel/kernels/cpu/kernel_cpu_image.h
@@ -23,51 +23,59 @@ CCL_NAMESPACE_BEGIN
ccl_device float4 kernel_tex_image_interp_impl(KernelGlobals *kg, int tex, float x, float y)
{
- if(tex >= TEX_START_HALF_CPU)
- return kg->texture_half_images[tex - TEX_START_HALF_CPU].interp(x, y);
- else if(tex >= TEX_START_BYTE_CPU)
- return kg->texture_byte_images[tex - TEX_START_BYTE_CPU].interp(x, y);
- else if(tex >= TEX_START_FLOAT_CPU)
- return kg->texture_float_images[tex - TEX_START_FLOAT_CPU].interp(x, y);
- else if(tex >= TEX_START_HALF4_CPU)
- return kg->texture_half4_images[tex - TEX_START_HALF4_CPU].interp(x, y);
- else if(tex >= TEX_START_BYTE4_CPU)
- return kg->texture_byte4_images[tex - TEX_START_BYTE4_CPU].interp(x, y);
- else
- return kg->texture_float4_images[tex].interp(x, y);
+ switch(kernel_tex_type(tex)) {
+ case IMAGE_DATA_TYPE_HALF:
+ return kg->texture_half_images[kernel_tex_index(tex)].interp(x, y);
+ case IMAGE_DATA_TYPE_BYTE:
+ return kg->texture_byte_images[kernel_tex_index(tex)].interp(x, y);
+ case IMAGE_DATA_TYPE_FLOAT:
+ return kg->texture_float_images[kernel_tex_index(tex)].interp(x, y);
+ case IMAGE_DATA_TYPE_HALF4:
+ return kg->texture_half4_images[kernel_tex_index(tex)].interp(x, y);
+ case IMAGE_DATA_TYPE_BYTE4:
+ return kg->texture_byte4_images[kernel_tex_index(tex)].interp(x, y);
+ case IMAGE_DATA_TYPE_FLOAT4:
+ default:
+ return kg->texture_float4_images[kernel_tex_index(tex)].interp(x, y);
+ }
}
ccl_device float4 kernel_tex_image_interp_3d_impl(KernelGlobals *kg, int tex, float x, float y, float z)
{
- if(tex >= TEX_START_HALF_CPU)
- return kg->texture_half_images[tex - TEX_START_HALF_CPU].interp_3d(x, y, z);
- else if(tex >= TEX_START_BYTE_CPU)
- return kg->texture_byte_images[tex - TEX_START_BYTE_CPU].interp_3d(x, y, z);
- else if(tex >= TEX_START_FLOAT_CPU)
- return kg->texture_float_images[tex - TEX_START_FLOAT_CPU].interp_3d(x, y, z);
- else if(tex >= TEX_START_HALF4_CPU)
- return kg->texture_half4_images[tex - TEX_START_HALF4_CPU].interp_3d(x, y, z);
- else if(tex >= TEX_START_BYTE4_CPU)
- return kg->texture_byte4_images[tex - TEX_START_BYTE4_CPU].interp_3d(x, y, z);
- else
- return kg->texture_float4_images[tex].interp_3d(x, y, z);
-
+ switch(kernel_tex_type(tex)) {
+ case IMAGE_DATA_TYPE_HALF:
+ return kg->texture_half_images[kernel_tex_index(tex)].interp_3d(x, y, z);
+ case IMAGE_DATA_TYPE_BYTE:
+ return kg->texture_byte_images[kernel_tex_index(tex)].interp_3d(x, y, z);
+ case IMAGE_DATA_TYPE_FLOAT:
+ return kg->texture_float_images[kernel_tex_index(tex)].interp_3d(x, y, z);
+ case IMAGE_DATA_TYPE_HALF4:
+ return kg->texture_half4_images[kernel_tex_index(tex)].interp_3d(x, y, z);
+ case IMAGE_DATA_TYPE_BYTE4:
+ return kg->texture_byte4_images[kernel_tex_index(tex)].interp_3d(x, y, z);
+ case IMAGE_DATA_TYPE_FLOAT4:
+ default:
+ return kg->texture_float4_images[kernel_tex_index(tex)].interp_3d(x, y, z);
+ }
}
ccl_device float4 kernel_tex_image_interp_3d_ex_impl(KernelGlobals *kg, int tex, float x, float y, float z, int interpolation)
{
- if(tex >= TEX_START_HALF_CPU)
- return kg->texture_half_images[tex - TEX_START_HALF_CPU].interp_3d_ex(x, y, z, interpolation);
- else if(tex >= TEX_START_BYTE_CPU)
- return kg->texture_byte_images[tex - TEX_START_BYTE_CPU].interp_3d_ex(x, y, z, interpolation);
- else if(tex >= TEX_START_FLOAT_CPU)
- return kg->texture_float_images[tex - TEX_START_FLOAT_CPU].interp_3d_ex(x, y, z, interpolation);
- else if(tex >= TEX_START_HALF4_CPU)
- return kg->texture_half4_images[tex - TEX_START_HALF4_CPU].interp_3d_ex(x, y, z, interpolation);
- else if(tex >= TEX_START_BYTE4_CPU)
- return kg->texture_byte4_images[tex - TEX_START_BYTE4_CPU].interp_3d_ex(x, y, z, interpolation);
- else
- return kg->texture_float4_images[tex].interp_3d_ex(x, y, z, interpolation);
+ switch(kernel_tex_type(tex)) {
+ case IMAGE_DATA_TYPE_HALF:
+ return kg->texture_half_images[kernel_tex_index(tex)].interp_3d_ex(x, y, z, interpolation);
+ case IMAGE_DATA_TYPE_BYTE:
+ return kg->texture_byte_images[kernel_tex_index(tex)].interp_3d_ex(x, y, z, interpolation);
+ case IMAGE_DATA_TYPE_FLOAT:
+ return kg->texture_float_images[kernel_tex_index(tex)].interp_3d_ex(x, y, z, interpolation);
+ case IMAGE_DATA_TYPE_HALF4:
+ return kg->texture_half4_images[kernel_tex_index(tex)].interp_3d_ex(x, y, z, interpolation);
+ case IMAGE_DATA_TYPE_BYTE4:
+ return kg->texture_byte4_images[kernel_tex_index(tex)].interp_3d_ex(x, y, z, interpolation);
+ case IMAGE_DATA_TYPE_FLOAT4:
+ default:
+ return kg->texture_float4_images[kernel_tex_index(tex)].interp_3d_ex(x, y, z, interpolation);
+ }
}
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h b/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h
index 148b2eef568..d4315ee5ec4 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h
+++ b/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h
@@ -22,38 +22,50 @@
#include "kernel/kernel_compat_cpu.h"
-#ifndef __SPLIT_KERNEL__
-# include "kernel/kernel_math.h"
-# include "kernel/kernel_types.h"
-
-# include "kernel/split/kernel_split_data.h"
-# include "kernel/kernel_globals.h"
-
-# include "kernel/kernels/cpu/kernel_cpu_image.h"
-# include "kernel/kernel_film.h"
-# include "kernel/kernel_path.h"
-# include "kernel/kernel_path_branched.h"
-# include "kernel/kernel_bake.h"
+#ifndef KERNEL_STUB
+# ifndef __SPLIT_KERNEL__
+# include "kernel/kernel_math.h"
+# include "kernel/kernel_types.h"
+
+# include "kernel/split/kernel_split_data.h"
+# include "kernel/kernel_globals.h"
+
+# include "kernel/kernels/cpu/kernel_cpu_image.h"
+# include "kernel/kernel_film.h"
+# include "kernel/kernel_path.h"
+# include "kernel/kernel_path_branched.h"
+# include "kernel/kernel_bake.h"
+# else
+# include "kernel/split/kernel_split_common.h"
+
+# include "kernel/split/kernel_data_init.h"
+# include "kernel/split/kernel_path_init.h"
+# include "kernel/split/kernel_scene_intersect.h"
+# include "kernel/split/kernel_lamp_emission.h"
+# include "kernel/split/kernel_do_volume.h"
+# include "kernel/split/kernel_queue_enqueue.h"
+# include "kernel/split/kernel_indirect_background.h"
+# include "kernel/split/kernel_shader_setup.h"
+# include "kernel/split/kernel_shader_sort.h"
+# include "kernel/split/kernel_shader_eval.h"
+# include "kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h"
+# include "kernel/split/kernel_subsurface_scatter.h"
+# include "kernel/split/kernel_direct_lighting.h"
+# include "kernel/split/kernel_shadow_blocked_ao.h"
+# include "kernel/split/kernel_shadow_blocked_dl.h"
+# include "kernel/split/kernel_enqueue_inactive.h"
+# include "kernel/split/kernel_next_iteration_setup.h"
+# include "kernel/split/kernel_indirect_subsurface.h"
+# include "kernel/split/kernel_buffer_update.h"
+# endif /* __SPLIT_KERNEL__ */
#else
-# include "kernel/split/kernel_split_common.h"
-
-# include "kernel/split/kernel_data_init.h"
-# include "kernel/split/kernel_path_init.h"
-# include "kernel/split/kernel_scene_intersect.h"
-# include "kernel/split/kernel_lamp_emission.h"
-# include "kernel/split/kernel_do_volume.h"
-# include "kernel/split/kernel_queue_enqueue.h"
-# include "kernel/split/kernel_indirect_background.h"
-# include "kernel/split/kernel_shader_eval.h"
-# include "kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h"
-# include "kernel/split/kernel_subsurface_scatter.h"
-# include "kernel/split/kernel_direct_lighting.h"
-# include "kernel/split/kernel_shadow_blocked_ao.h"
-# include "kernel/split/kernel_shadow_blocked_dl.h"
-# include "kernel/split/kernel_next_iteration_setup.h"
-# include "kernel/split/kernel_indirect_subsurface.h"
-# include "kernel/split/kernel_buffer_update.h"
-#endif
+# include "util/util_debug.h"
+# define STUB_ASSERT(arch, name) assert(!(#name " kernel stub for architecture " #arch " was called!"))
+
+# ifdef __SPLIT_KERNEL__
+# include "kernel/split/kernel_data_init.h"
+# endif /* __SPLIT_KERNEL__ */
+#endif /* KERNEL_STUB */
CCL_NAMESPACE_BEGIN
@@ -69,7 +81,10 @@ void KERNEL_FUNCTION_FULL_NAME(path_trace)(KernelGlobals *kg,
int offset,
int stride)
{
-#ifdef __BRANCHED_PATH__
+#ifdef KERNEL_STUB
+ STUB_ASSERT(KERNEL_ARCH, path_trace);
+#else
+# ifdef __BRANCHED_PATH__
if(kernel_data.integrator.branched) {
kernel_branched_path_trace(kg,
buffer,
@@ -80,10 +95,11 @@ void KERNEL_FUNCTION_FULL_NAME(path_trace)(KernelGlobals *kg,
stride);
}
else
-#endif
+# endif
{
kernel_path_trace(kg, buffer, rng_state, sample, x, y, offset, stride);
}
+#endif /* KERNEL_STUB */
}
/* Film */
@@ -96,6 +112,9 @@ void KERNEL_FUNCTION_FULL_NAME(convert_to_byte)(KernelGlobals *kg,
int offset,
int stride)
{
+#ifdef KERNEL_STUB
+ STUB_ASSERT(KERNEL_ARCH, convert_to_byte);
+#else
kernel_film_convert_to_byte(kg,
rgba,
buffer,
@@ -103,6 +122,7 @@ void KERNEL_FUNCTION_FULL_NAME(convert_to_byte)(KernelGlobals *kg,
x, y,
offset,
stride);
+#endif /* KERNEL_STUB */
}
void KERNEL_FUNCTION_FULL_NAME(convert_to_half_float)(KernelGlobals *kg,
@@ -113,6 +133,9 @@ void KERNEL_FUNCTION_FULL_NAME(convert_to_half_float)(KernelGlobals *kg,
int offset,
int stride)
{
+#ifdef KERNEL_STUB
+ STUB_ASSERT(KERNEL_ARCH, convert_to_half_float);
+#else
kernel_film_convert_to_half_float(kg,
rgba,
buffer,
@@ -120,6 +143,7 @@ void KERNEL_FUNCTION_FULL_NAME(convert_to_half_float)(KernelGlobals *kg,
x, y,
offset,
stride);
+#endif /* KERNEL_STUB */
}
/* Shader Evaluate */
@@ -134,9 +158,12 @@ void KERNEL_FUNCTION_FULL_NAME(shader)(KernelGlobals *kg,
int offset,
int sample)
{
+#ifdef KERNEL_STUB
+ STUB_ASSERT(KERNEL_ARCH, shader);
+#else
if(type >= SHADER_EVAL_BAKE) {
kernel_assert(output_luma == NULL);
-#ifdef __BAKING__
+# ifdef __BAKING__
kernel_bake_evaluate(kg,
input,
output,
@@ -145,7 +172,7 @@ void KERNEL_FUNCTION_FULL_NAME(shader)(KernelGlobals *kg,
i,
offset,
sample);
-#endif
+# endif
}
else {
kernel_shader_evaluate(kg,
@@ -156,24 +183,39 @@ void KERNEL_FUNCTION_FULL_NAME(shader)(KernelGlobals *kg,
i,
sample);
}
+#endif /* KERNEL_STUB */
}
#else /* __SPLIT_KERNEL__ */
/* Split Kernel Path Tracing */
-#define DEFINE_SPLIT_KERNEL_FUNCTION(name) \
+#ifdef KERNEL_STUB
+# define DEFINE_SPLIT_KERNEL_FUNCTION(name) \
+ void KERNEL_FUNCTION_FULL_NAME(name)(KernelGlobals *kg, KernelData* /*data*/) \
+ { \
+ STUB_ASSERT(KERNEL_ARCH, name); \
+ }
+
+# define DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(name, type) \
+ void KERNEL_FUNCTION_FULL_NAME(name)(KernelGlobals *kg, KernelData* /*data*/) \
+ { \
+ STUB_ASSERT(KERNEL_ARCH, name); \
+ }
+#else
+# define DEFINE_SPLIT_KERNEL_FUNCTION(name) \
void KERNEL_FUNCTION_FULL_NAME(name)(KernelGlobals *kg, KernelData* /*data*/) \
{ \
kernel_##name(kg); \
}
-#define DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(name, type) \
+# define DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(name, type) \
void KERNEL_FUNCTION_FULL_NAME(name)(KernelGlobals *kg, KernelData* /*data*/) \
{ \
ccl_local type locals; \
kernel_##name(kg, &locals); \
}
+#endif /* KERNEL_STUB */
DEFINE_SPLIT_KERNEL_FUNCTION(path_init)
DEFINE_SPLIT_KERNEL_FUNCTION(scene_intersect)
@@ -181,49 +223,22 @@ DEFINE_SPLIT_KERNEL_FUNCTION(lamp_emission)
DEFINE_SPLIT_KERNEL_FUNCTION(do_volume)
DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(queue_enqueue, QueueEnqueueLocals)
DEFINE_SPLIT_KERNEL_FUNCTION(indirect_background)
-DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(shader_eval, uint)
+DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(shader_setup, uint)
+DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(shader_sort, ShaderSortLocals)
+DEFINE_SPLIT_KERNEL_FUNCTION(shader_eval)
DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(holdout_emission_blurring_pathtermination_ao, BackgroundAOLocals)
-DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(subsurface_scatter, uint)
+DEFINE_SPLIT_KERNEL_FUNCTION(subsurface_scatter)
DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(direct_lighting, uint)
DEFINE_SPLIT_KERNEL_FUNCTION(shadow_blocked_ao)
DEFINE_SPLIT_KERNEL_FUNCTION(shadow_blocked_dl)
+DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(enqueue_inactive, uint)
DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(next_iteration_setup, uint)
DEFINE_SPLIT_KERNEL_FUNCTION(indirect_subsurface)
DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(buffer_update, uint)
-
-void KERNEL_FUNCTION_FULL_NAME(register_functions)(void(*reg)(const char* name, void* func))
-{
-#define REGISTER_NAME_STRING(name) #name
-#define REGISTER_EVAL_NAME(name) REGISTER_NAME_STRING(name)
-#define REGISTER(name) reg(REGISTER_EVAL_NAME(KERNEL_FUNCTION_FULL_NAME(name)), (void*)KERNEL_FUNCTION_FULL_NAME(name));
-
- REGISTER(path_trace);
- REGISTER(convert_to_byte);
- REGISTER(convert_to_half_float);
- REGISTER(shader);
-
- REGISTER(data_init);
- REGISTER(path_init);
- REGISTER(scene_intersect);
- REGISTER(lamp_emission);
- REGISTER(do_volume);
- REGISTER(queue_enqueue);
- REGISTER(indirect_background);
- REGISTER(shader_eval);
- REGISTER(holdout_emission_blurring_pathtermination_ao);
- REGISTER(subsurface_scatter);
- REGISTER(direct_lighting);
- REGISTER(shadow_blocked_ao);
- REGISTER(shadow_blocked_dl);
- REGISTER(next_iteration_setup);
- REGISTER(indirect_subsurface);
- REGISTER(buffer_update);
-
-#undef REGISTER
-#undef REGISTER_EVAL_NAME
-#undef REGISTER_NAME_STRING
-}
-
#endif /* __SPLIT_KERNEL__ */
+#undef KERNEL_STUB
+#undef STUB_ASSERT
+#undef KERNEL_ARCH
+
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_split_avx.cpp b/intern/cycles/kernel/kernels/cpu/kernel_split_avx.cpp
index 27a746a0799..6ba3425a343 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_split_avx.cpp
+++ b/intern/cycles/kernel/kernels/cpu/kernel_split_avx.cpp
@@ -17,22 +17,25 @@
/* Optimized CPU kernel entry points. This file is compiled with AVX
* optimization flags and nearly all functions inlined, while kernel.cpp
* is compiled without for other CPU's. */
-
-/* SSE optimization disabled for now on 32 bit, see bug #36316 */
-#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
-# define __KERNEL_SSE2__
-# define __KERNEL_SSE3__
-# define __KERNEL_SSSE3__
-# define __KERNEL_SSE41__
-# define __KERNEL_AVX__
-#endif
#define __SPLIT_KERNEL__
#include "util/util_optimization.h"
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
-# include "kernel/kernel.h"
-# define KERNEL_ARCH cpu_avx
-# include "kernel/kernels/cpu/kernel_cpu_impl.h"
+#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
+# define KERNEL_STUB
+#else
+/* SSE optimization disabled for now on 32 bit, see bug #36316 */
+# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+# define __KERNEL_SSE__
+# define __KERNEL_SSE2__
+# define __KERNEL_SSE3__
+# define __KERNEL_SSSE3__
+# define __KERNEL_SSE41__
+# define __KERNEL_AVX__
+# endif
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX */
+
+#include "kernel/kernel.h"
+#define KERNEL_ARCH cpu_avx
+#include "kernel/kernels/cpu/kernel_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_split_avx2.cpp b/intern/cycles/kernel/kernels/cpu/kernel_split_avx2.cpp
index 364d279a189..76b2d77ebb8 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_split_avx2.cpp
+++ b/intern/cycles/kernel/kernels/cpu/kernel_split_avx2.cpp
@@ -18,23 +18,25 @@
* optimization flags and nearly all functions inlined, while kernel.cpp
* is compiled without for other CPU's. */
-/* SSE optimization disabled for now on 32 bit, see bug #36316 */
-#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
-# define __KERNEL_SSE__
-# define __KERNEL_SSE2__
-# define __KERNEL_SSE3__
-# define __KERNEL_SSSE3__
-# define __KERNEL_SSE41__
-# define __KERNEL_AVX__
-# define __KERNEL_AVX2__
-#endif
-
#define __SPLIT_KERNEL__
#include "util/util_optimization.h"
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
-# include "kernel/kernel.h"
-# define KERNEL_ARCH cpu_avx2
-# include "kernel/kernels/cpu/kernel_cpu_impl.h"
+#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
+# define KERNEL_STUB
+#else
+/* SSE optimization disabled for now on 32 bit, see bug #36316 */
+# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+# define __KERNEL_SSE__
+# define __KERNEL_SSE2__
+# define __KERNEL_SSE3__
+# define __KERNEL_SSSE3__
+# define __KERNEL_SSE41__
+# define __KERNEL_AVX__
+# define __KERNEL_AVX2__
+# endif
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 */
+
+#include "kernel/kernel.h"
+#define KERNEL_ARCH cpu_avx2
+#include "kernel/kernels/cpu/kernel_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_split_sse2.cpp b/intern/cycles/kernel/kernels/cpu/kernel_split_sse2.cpp
index 0afb481296f..b468b6f44c8 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_split_sse2.cpp
+++ b/intern/cycles/kernel/kernels/cpu/kernel_split_sse2.cpp
@@ -18,17 +18,19 @@
* optimization flags and nearly all functions inlined, while kernel.cpp
* is compiled without for other CPU's. */
-/* SSE optimization disabled for now on 32 bit, see bug #36316 */
-#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
-# define __KERNEL_SSE2__
-#endif
-
#define __SPLIT_KERNEL__
#include "util/util_optimization.h"
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
-# include "kernel/kernel.h"
-# define KERNEL_ARCH cpu_sse2
-# include "kernel/kernels/cpu/kernel_cpu_impl.h"
+#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
+# define KERNEL_STUB
+#else
+/* SSE optimization disabled for now on 32 bit, see bug #36316 */
+# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+# define __KERNEL_SSE2__
+# endif
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 */
+
+#include "kernel/kernel.h"
+#define KERNEL_ARCH cpu_sse2
+#include "kernel/kernels/cpu/kernel_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_split_sse3.cpp b/intern/cycles/kernel/kernels/cpu/kernel_split_sse3.cpp
index 13d00813591..3e5792d0b17 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_split_sse3.cpp
+++ b/intern/cycles/kernel/kernels/cpu/kernel_split_sse3.cpp
@@ -18,19 +18,21 @@
* optimization flags and nearly all functions inlined, while kernel.cpp
* is compiled without for other CPU's. */
-/* SSE optimization disabled for now on 32 bit, see bug #36316 */
-#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
-# define __KERNEL_SSE2__
-# define __KERNEL_SSE3__
-# define __KERNEL_SSSE3__
-#endif
-
#define __SPLIT_KERNEL__
#include "util/util_optimization.h"
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
-# include "kernel/kernel.h"
-# define KERNEL_ARCH cpu_sse3
-# include "kernel/kernels/cpu/kernel_cpu_impl.h"
+#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
+# define KERNEL_STUB
+#else
+/* SSE optimization disabled for now on 32 bit, see bug #36316 */
+# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+# define __KERNEL_SSE2__
+# define __KERNEL_SSE3__
+# define __KERNEL_SSSE3__
+# endif
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 */
+
+#include "kernel/kernel.h"
+#define KERNEL_ARCH cpu_sse3
+#include "kernel/kernels/cpu/kernel_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_split_sse41.cpp b/intern/cycles/kernel/kernels/cpu/kernel_split_sse41.cpp
index a4312071edc..3629f21cd29 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_split_sse41.cpp
+++ b/intern/cycles/kernel/kernels/cpu/kernel_split_sse41.cpp
@@ -18,20 +18,22 @@
* optimization flags and nearly all functions inlined, while kernel.cpp
* is compiled without for other CPU's. */
-/* SSE optimization disabled for now on 32 bit, see bug #36316 */
-#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
-# define __KERNEL_SSE2__
-# define __KERNEL_SSE3__
-# define __KERNEL_SSSE3__
-# define __KERNEL_SSE41__
-#endif
-
#define __SPLIT_KERNEL__
#include "util/util_optimization.h"
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
-# include "kernel/kernel.h"
-# define KERNEL_ARCH cpu_sse41
-# include "kernel/kernels/cpu/kernel_cpu_impl.h"
+#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
+# define KERNEL_STUB
+#else
+/* SSE optimization disabled for now on 32 bit, see bug #36316 */
+# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+# define __KERNEL_SSE2__
+# define __KERNEL_SSE3__
+# define __KERNEL_SSSE3__
+# define __KERNEL_SSE41__
+# endif
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 */
+
+#include "kernel/kernel.h"
+#define KERNEL_ARCH cpu_sse41
+#include "kernel/kernels/cpu/kernel_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_sse2.cpp b/intern/cycles/kernel/kernels/cpu/kernel_sse2.cpp
index 1acfaa91ac9..57530c88710 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_sse2.cpp
+++ b/intern/cycles/kernel/kernels/cpu/kernel_sse2.cpp
@@ -18,15 +18,17 @@
* optimization flags and nearly all functions inlined, while kernel.cpp
* is compiled without for other CPU's. */
-/* SSE optimization disabled for now on 32 bit, see bug #36316 */
-#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
-# define __KERNEL_SSE2__
-#endif
-
#include "util/util_optimization.h"
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
-# include "kernel/kernel.h"
-# define KERNEL_ARCH cpu_sse2
-# include "kernel/kernels/cpu/kernel_cpu_impl.h"
+#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
+# define KERNEL_STUB
+#else
+/* SSE optimization disabled for now on 32 bit, see bug #36316 */
+# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+# define __KERNEL_SSE2__
+# endif
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 */
+
+#include "kernel/kernel.h"
+#define KERNEL_ARCH cpu_sse2
+#include "kernel/kernels/cpu/kernel_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_sse3.cpp b/intern/cycles/kernel/kernels/cpu/kernel_sse3.cpp
index f7b6a2e21fe..c607753bc4b 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_sse3.cpp
+++ b/intern/cycles/kernel/kernels/cpu/kernel_sse3.cpp
@@ -18,17 +18,19 @@
* optimization flags and nearly all functions inlined, while kernel.cpp
* is compiled without for other CPU's. */
-/* SSE optimization disabled for now on 32 bit, see bug #36316 */
-#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
-# define __KERNEL_SSE2__
-# define __KERNEL_SSE3__
-# define __KERNEL_SSSE3__
-#endif
-
#include "util/util_optimization.h"
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
-# include "kernel/kernel.h"
-# define KERNEL_ARCH cpu_sse3
-# include "kernel/kernels/cpu/kernel_cpu_impl.h"
+#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
+# define KERNEL_STUB
+#else
+/* SSE optimization disabled for now on 32 bit, see bug #36316 */
+# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+# define __KERNEL_SSE2__
+# define __KERNEL_SSE3__
+# define __KERNEL_SSSE3__
+# endif
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 */
+
+#include "kernel/kernel.h"
+#define KERNEL_ARCH cpu_sse3
+#include "kernel/kernels/cpu/kernel_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_sse41.cpp b/intern/cycles/kernel/kernels/cpu/kernel_sse41.cpp
index 1900c6e3012..a278554731c 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_sse41.cpp
+++ b/intern/cycles/kernel/kernels/cpu/kernel_sse41.cpp
@@ -18,18 +18,20 @@
* optimization flags and nearly all functions inlined, while kernel.cpp
* is compiled without for other CPU's. */
-/* SSE optimization disabled for now on 32 bit, see bug #36316 */
-#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
-# define __KERNEL_SSE2__
-# define __KERNEL_SSE3__
-# define __KERNEL_SSSE3__
-# define __KERNEL_SSE41__
-#endif
-
#include "util/util_optimization.h"
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
-# include "kernel/kernel.h"
-# define KERNEL_ARCH cpu_sse41
-# include "kernel/kernels/cpu//kernel_cpu_impl.h"
+#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
+# define KERNEL_STUB
+#else
+/* SSE optimization disabled for now on 32 bit, see bug #36316 */
+# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+# define __KERNEL_SSE2__
+# define __KERNEL_SSE3__
+# define __KERNEL_SSSE3__
+# define __KERNEL_SSE41__
+# endif
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 */
+
+#include "kernel/kernel.h"
+#define KERNEL_ARCH cpu_sse41
+#include "kernel/kernels/cpu/kernel_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cuda/filter.cu b/intern/cycles/kernel/kernels/cuda/filter.cu
new file mode 100644
index 00000000000..009c3fde9d5
--- /dev/null
+++ b/intern/cycles/kernel/kernels/cuda/filter.cu
@@ -0,0 +1,255 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* CUDA kernel entry points */
+
+#ifdef __CUDA_ARCH__
+
+#include "kernel_config.h"
+
+#include "kernel/kernel_compat_cuda.h"
+
+#include "kernel/filter/filter_kernel.h"
+
+/* kernels */
+
+extern "C" __global__ void
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
+kernel_cuda_filter_divide_shadow(int sample,
+ TilesInfo *tiles,
+ float *unfilteredA,
+ float *unfilteredB,
+ float *sampleVariance,
+ float *sampleVarianceV,
+ float *bufferVariance,
+ int4 prefilter_rect,
+ int buffer_pass_stride,
+ int buffer_denoising_offset,
+ bool use_split_variance)
+{
+ int x = prefilter_rect.x + blockDim.x*blockIdx.x + threadIdx.x;
+ int y = prefilter_rect.y + blockDim.y*blockIdx.y + threadIdx.y;
+ if(x < prefilter_rect.z && y < prefilter_rect.w) {
+ kernel_filter_divide_shadow(sample,
+ tiles,
+ x, y,
+ unfilteredA,
+ unfilteredB,
+ sampleVariance,
+ sampleVarianceV,
+ bufferVariance,
+ prefilter_rect,
+ buffer_pass_stride,
+ buffer_denoising_offset,
+ use_split_variance);
+ }
+}
+
+extern "C" __global__ void
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
+kernel_cuda_filter_get_feature(int sample,
+ TilesInfo *tiles,
+ int m_offset,
+ int v_offset,
+ float *mean,
+ float *variance,
+ int4 prefilter_rect,
+ int buffer_pass_stride,
+ int buffer_denoising_offset,
+ bool use_split_variance)
+{
+ int x = prefilter_rect.x + blockDim.x*blockIdx.x + threadIdx.x;
+ int y = prefilter_rect.y + blockDim.y*blockIdx.y + threadIdx.y;
+ if(x < prefilter_rect.z && y < prefilter_rect.w) {
+ kernel_filter_get_feature(sample,
+ tiles,
+ m_offset, v_offset,
+ x, y,
+ mean, variance,
+ prefilter_rect,
+ buffer_pass_stride,
+ buffer_denoising_offset,
+ use_split_variance);
+ }
+}
+
+extern "C" __global__ void
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
+kernel_cuda_filter_detect_outliers(float *image,
+ float *variance,
+ float *depth,
+ float *output,
+ int4 prefilter_rect,
+ int pass_stride)
+{
+ int x = prefilter_rect.x + blockDim.x*blockIdx.x + threadIdx.x;
+ int y = prefilter_rect.y + blockDim.y*blockIdx.y + threadIdx.y;
+ if(x < prefilter_rect.z && y < prefilter_rect.w) {
+ kernel_filter_detect_outliers(x, y, image, variance, depth, output, prefilter_rect, pass_stride);
+ }
+}
+
+extern "C" __global__ void
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
+kernel_cuda_filter_combine_halves(float *mean, float *variance, float *a, float *b, int4 prefilter_rect, int r)
+{
+ int x = prefilter_rect.x + blockDim.x*blockIdx.x + threadIdx.x;
+ int y = prefilter_rect.y + blockDim.y*blockIdx.y + threadIdx.y;
+ if(x < prefilter_rect.z && y < prefilter_rect.w) {
+ kernel_filter_combine_halves(x, y, mean, variance, a, b, prefilter_rect, r);
+ }
+}
+
+extern "C" __global__ void
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
+kernel_cuda_filter_construct_transform(float const* __restrict__ buffer,
+ float *transform, int *rank,
+ int4 filter_area, int4 rect,
+ int radius, float pca_threshold,
+ int pass_stride)
+{
+ int x = blockDim.x*blockIdx.x + threadIdx.x;
+ int y = blockDim.y*blockIdx.y + threadIdx.y;
+ if(x < filter_area.z && y < filter_area.w) {
+ int *l_rank = rank + y*filter_area.z + x;
+ float *l_transform = transform + y*filter_area.z + x;
+ kernel_filter_construct_transform(buffer,
+ x + filter_area.x, y + filter_area.y,
+ rect, pass_stride,
+ l_transform, l_rank,
+ radius, pca_threshold,
+ filter_area.z*filter_area.w,
+ threadIdx.y*blockDim.x + threadIdx.x);
+ }
+}
+
+extern "C" __global__ void
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
+kernel_cuda_filter_nlm_calc_difference(int dx, int dy,
+ const float *ccl_restrict weight_image,
+ const float *ccl_restrict variance_image,
+ float *difference_image,
+ int4 rect, int w,
+ int channel_offset,
+ float a, float k_2)
+{
+ int x = blockDim.x*blockIdx.x + threadIdx.x + rect.x;
+ int y = blockDim.y*blockIdx.y + threadIdx.y + rect.y;
+ if(x < rect.z && y < rect.w) {
+ kernel_filter_nlm_calc_difference(x, y, dx, dy, weight_image, variance_image, difference_image, rect, w, channel_offset, a, k_2);
+ }
+}
+
+extern "C" __global__ void
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
+kernel_cuda_filter_nlm_blur(const float *ccl_restrict difference_image, float *out_image, int4 rect, int w, int f)
+{
+ int x = blockDim.x*blockIdx.x + threadIdx.x + rect.x;
+ int y = blockDim.y*blockIdx.y + threadIdx.y + rect.y;
+ if(x < rect.z && y < rect.w) {
+ kernel_filter_nlm_blur(x, y, difference_image, out_image, rect, w, f);
+ }
+}
+
+extern "C" __global__ void
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
+kernel_cuda_filter_nlm_calc_weight(const float *ccl_restrict difference_image, float *out_image, int4 rect, int w, int f)
+{
+ int x = blockDim.x*blockIdx.x + threadIdx.x + rect.x;
+ int y = blockDim.y*blockIdx.y + threadIdx.y + rect.y;
+ if(x < rect.z && y < rect.w) {
+ kernel_filter_nlm_calc_weight(x, y, difference_image, out_image, rect, w, f);
+ }
+}
+
+extern "C" __global__ void
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
+kernel_cuda_filter_nlm_update_output(int dx, int dy,
+ const float *ccl_restrict difference_image,
+ const float *ccl_restrict image,
+ float *out_image, float *accum_image,
+ int4 rect, int w,
+ int f)
+{
+ int x = blockDim.x*blockIdx.x + threadIdx.x + rect.x;
+ int y = blockDim.y*blockIdx.y + threadIdx.y + rect.y;
+ if(x < rect.z && y < rect.w) {
+ kernel_filter_nlm_update_output(x, y, dx, dy, difference_image, image, out_image, accum_image, rect, w, f);
+ }
+}
+
+extern "C" __global__ void
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
+kernel_cuda_filter_nlm_normalize(float *out_image, const float *ccl_restrict accum_image, int4 rect, int w)
+{
+ int x = blockDim.x*blockIdx.x + threadIdx.x + rect.x;
+ int y = blockDim.y*blockIdx.y + threadIdx.y + rect.y;
+ if(x < rect.z && y < rect.w) {
+ kernel_filter_nlm_normalize(x, y, out_image, accum_image, rect, w);
+ }
+}
+
+extern "C" __global__ void
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
+kernel_cuda_filter_nlm_construct_gramian(int dx, int dy,
+ const float *ccl_restrict difference_image,
+ const float *ccl_restrict buffer,
+ float const* __restrict__ transform,
+ int *rank,
+ float *XtWX,
+ float3 *XtWY,
+ int4 rect,
+ int4 filter_rect,
+ int w, int h, int f,
+ int pass_stride)
+{
+ int x = blockDim.x*blockIdx.x + threadIdx.x + max(0, rect.x-filter_rect.x);
+ int y = blockDim.y*blockIdx.y + threadIdx.y + max(0, rect.y-filter_rect.y);
+ if(x < min(filter_rect.z, rect.z-filter_rect.x) && y < min(filter_rect.w, rect.w-filter_rect.y)) {
+ kernel_filter_nlm_construct_gramian(x, y,
+ dx, dy,
+ difference_image,
+ buffer,
+ transform, rank,
+ XtWX, XtWY,
+ rect, filter_rect,
+ w, h, f,
+ pass_stride,
+ threadIdx.y*blockDim.x + threadIdx.x);
+ }
+}
+
+extern "C" __global__ void
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
+kernel_cuda_filter_finalize(int w, int h,
+ float *buffer, int *rank,
+ float *XtWX, float3 *XtWY,
+ int4 filter_area, int4 buffer_params,
+ int sample)
+{
+ int x = blockDim.x*blockIdx.x + threadIdx.x;
+ int y = blockDim.y*blockIdx.y + threadIdx.y;
+ if(x < filter_area.z && y < filter_area.w) {
+ int storage_ofs = y*filter_area.z+x;
+ rank += storage_ofs;
+ XtWX += storage_ofs;
+ XtWY += storage_ofs;
+ kernel_filter_finalize(x, y, w, h, buffer, rank, filter_area.z*filter_area.w, XtWX, XtWY, buffer_params, sample);
+ }
+}
+
+#endif
+
diff --git a/intern/cycles/kernel/kernels/cuda/kernel_split.cu b/intern/cycles/kernel/kernels/cuda/kernel_split.cu
index a679eff8409..628891b1458 100644
--- a/intern/cycles/kernel/kernels/cuda/kernel_split.cu
+++ b/intern/cycles/kernel/kernels/cuda/kernel_split.cu
@@ -31,12 +31,15 @@
#include "kernel/split/kernel_do_volume.h"
#include "kernel/split/kernel_queue_enqueue.h"
#include "kernel/split/kernel_indirect_background.h"
+#include "kernel/split/kernel_shader_setup.h"
+#include "kernel/split/kernel_shader_sort.h"
#include "kernel/split/kernel_shader_eval.h"
#include "kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h"
#include "kernel/split/kernel_subsurface_scatter.h"
#include "kernel/split/kernel_direct_lighting.h"
#include "kernel/split/kernel_shadow_blocked_ao.h"
#include "kernel/split/kernel_shadow_blocked_dl.h"
+#include "kernel/split/kernel_enqueue_inactive.h"
#include "kernel/split/kernel_next_iteration_setup.h"
#include "kernel/split/kernel_indirect_subsurface.h"
#include "kernel/split/kernel_buffer_update.h"
@@ -108,12 +111,15 @@ DEFINE_SPLIT_KERNEL_FUNCTION(lamp_emission)
DEFINE_SPLIT_KERNEL_FUNCTION(do_volume)
DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(queue_enqueue, QueueEnqueueLocals)
DEFINE_SPLIT_KERNEL_FUNCTION(indirect_background)
-DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(shader_eval, uint)
+DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(shader_setup, uint)
+DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(shader_sort, ShaderSortLocals)
+DEFINE_SPLIT_KERNEL_FUNCTION(shader_eval)
DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(holdout_emission_blurring_pathtermination_ao, BackgroundAOLocals)
-DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(subsurface_scatter, uint)
+DEFINE_SPLIT_KERNEL_FUNCTION(subsurface_scatter)
DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(direct_lighting, uint)
DEFINE_SPLIT_KERNEL_FUNCTION(shadow_blocked_ao)
DEFINE_SPLIT_KERNEL_FUNCTION(shadow_blocked_dl)
+DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(enqueue_inactive, uint)
DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(next_iteration_setup, uint)
DEFINE_SPLIT_KERNEL_FUNCTION(indirect_subsurface)
DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(buffer_update, uint)
diff --git a/intern/cycles/kernel/kernels/opencl/filter.cl b/intern/cycles/kernel/kernels/opencl/filter.cl
new file mode 100644
index 00000000000..ba53ba4b26f
--- /dev/null
+++ b/intern/cycles/kernel/kernels/opencl/filter.cl
@@ -0,0 +1,280 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* OpenCL kernel entry points */
+
+#include "kernel/kernel_compat_opencl.h"
+
+#include "kernel/filter/filter_kernel.h"
+
+/* kernels */
+
+__kernel void kernel_ocl_filter_divide_shadow(int sample,
+ ccl_global TilesInfo *tiles,
+ ccl_global float *unfilteredA,
+ ccl_global float *unfilteredB,
+ ccl_global float *sampleVariance,
+ ccl_global float *sampleVarianceV,
+ ccl_global float *bufferVariance,
+ int4 prefilter_rect,
+ int buffer_pass_stride,
+ int buffer_denoising_offset,
+ char use_split_variance)
+{
+ int x = prefilter_rect.x + get_global_id(0);
+ int y = prefilter_rect.y + get_global_id(1);
+ if(x < prefilter_rect.z && y < prefilter_rect.w) {
+ kernel_filter_divide_shadow(sample,
+ tiles,
+ x, y,
+ unfilteredA,
+ unfilteredB,
+ sampleVariance,
+ sampleVarianceV,
+ bufferVariance,
+ prefilter_rect,
+ buffer_pass_stride,
+ buffer_denoising_offset,
+ use_split_variance);
+ }
+}
+
+__kernel void kernel_ocl_filter_get_feature(int sample,
+ ccl_global TilesInfo *tiles,
+ int m_offset,
+ int v_offset,
+ ccl_global float *mean,
+ ccl_global float *variance,
+ int4 prefilter_rect,
+ int buffer_pass_stride,
+ int buffer_denoising_offset,
+ char use_split_variance)
+{
+ int x = prefilter_rect.x + get_global_id(0);
+ int y = prefilter_rect.y + get_global_id(1);
+ if(x < prefilter_rect.z && y < prefilter_rect.w) {
+ kernel_filter_get_feature(sample,
+ tiles,
+ m_offset, v_offset,
+ x, y,
+ mean, variance,
+ prefilter_rect,
+ buffer_pass_stride,
+ buffer_denoising_offset,
+ use_split_variance);
+ }
+}
+
+__kernel void kernel_ocl_filter_detect_outliers(ccl_global float *image,
+ ccl_global float *variance,
+ ccl_global float *depth,
+ ccl_global float *output,
+ int4 prefilter_rect,
+ int pass_stride)
+{
+ int x = prefilter_rect.x + get_global_id(0);
+ int y = prefilter_rect.y + get_global_id(1);
+ if(x < prefilter_rect.z && y < prefilter_rect.w) {
+ kernel_filter_detect_outliers(x, y, image, variance, depth, output, prefilter_rect, pass_stride);
+ }
+}
+
+__kernel void kernel_ocl_filter_combine_halves(ccl_global float *mean,
+ ccl_global float *variance,
+ ccl_global float *a,
+ ccl_global float *b,
+ int4 prefilter_rect,
+ int r)
+{
+ int x = prefilter_rect.x + get_global_id(0);
+ int y = prefilter_rect.y + get_global_id(1);
+ if(x < prefilter_rect.z && y < prefilter_rect.w) {
+ kernel_filter_combine_halves(x, y, mean, variance, a, b, prefilter_rect, r);
+ }
+}
+
+__kernel void kernel_ocl_filter_construct_transform(const ccl_global float *ccl_restrict buffer,
+ ccl_global float *transform,
+ ccl_global int *rank,
+ int4 filter_area,
+ int4 rect,
+ int pass_stride,
+ int radius,
+ float pca_threshold)
+{
+ int x = get_global_id(0);
+ int y = get_global_id(1);
+ if(x < filter_area.z && y < filter_area.w) {
+ ccl_global int *l_rank = rank + y*filter_area.z + x;
+ ccl_global float *l_transform = transform + y*filter_area.z + x;
+ kernel_filter_construct_transform(buffer,
+ x + filter_area.x, y + filter_area.y,
+ rect, pass_stride,
+ l_transform, l_rank,
+ radius, pca_threshold,
+ filter_area.z*filter_area.w,
+ get_local_id(1)*get_local_size(0) + get_local_id(0));
+ }
+}
+
+__kernel void kernel_ocl_filter_nlm_calc_difference(int dx,
+ int dy,
+ const ccl_global float *ccl_restrict weight_image,
+ const ccl_global float *ccl_restrict variance_image,
+ ccl_global float *difference_image,
+ int4 rect,
+ int w,
+ int channel_offset,
+ float a,
+ float k_2)
+{
+ int x = get_global_id(0) + rect.x;
+ int y = get_global_id(1) + rect.y;
+ if(x < rect.z && y < rect.w) {
+ kernel_filter_nlm_calc_difference(x, y, dx, dy, weight_image, variance_image, difference_image, rect, w, channel_offset, a, k_2);
+ }
+}
+
+__kernel void kernel_ocl_filter_nlm_blur(const ccl_global float *ccl_restrict difference_image,
+ ccl_global float *out_image,
+ int4 rect,
+ int w,
+ int f)
+{
+ int x = get_global_id(0) + rect.x;
+ int y = get_global_id(1) + rect.y;
+ if(x < rect.z && y < rect.w) {
+ kernel_filter_nlm_blur(x, y, difference_image, out_image, rect, w, f);
+ }
+}
+
+__kernel void kernel_ocl_filter_nlm_calc_weight(const ccl_global float *ccl_restrict difference_image,
+ ccl_global float *out_image,
+ int4 rect,
+ int w,
+ int f)
+{
+ int x = get_global_id(0) + rect.x;
+ int y = get_global_id(1) + rect.y;
+ if(x < rect.z && y < rect.w) {
+ kernel_filter_nlm_calc_weight(x, y, difference_image, out_image, rect, w, f);
+ }
+}
+
+__kernel void kernel_ocl_filter_nlm_update_output(int dx,
+ int dy,
+ const ccl_global float *ccl_restrict difference_image,
+ const ccl_global float *ccl_restrict image,
+ ccl_global float *out_image,
+ ccl_global float *accum_image,
+ int4 rect,
+ int w,
+ int f)
+{
+ int x = get_global_id(0) + rect.x;
+ int y = get_global_id(1) + rect.y;
+ if(x < rect.z && y < rect.w) {
+ kernel_filter_nlm_update_output(x, y, dx, dy, difference_image, image, out_image, accum_image, rect, w, f);
+ }
+}
+
+__kernel void kernel_ocl_filter_nlm_normalize(ccl_global float *out_image,
+ const ccl_global float *ccl_restrict accum_image,
+ int4 rect,
+ int w)
+{
+ int x = get_global_id(0) + rect.x;
+ int y = get_global_id(1) + rect.y;
+ if(x < rect.z && y < rect.w) {
+ kernel_filter_nlm_normalize(x, y, out_image, accum_image, rect, w);
+ }
+}
+
+__kernel void kernel_ocl_filter_nlm_construct_gramian(int dx,
+ int dy,
+ const ccl_global float *ccl_restrict difference_image,
+ const ccl_global float *ccl_restrict buffer,
+ const ccl_global float *ccl_restrict transform,
+ ccl_global int *rank,
+ ccl_global float *XtWX,
+ ccl_global float3 *XtWY,
+ int4 rect,
+ int4 filter_rect,
+ int w,
+ int h,
+ int f,
+ int pass_stride)
+{
+ int x = get_global_id(0) + max(0, rect.x-filter_rect.x);
+ int y = get_global_id(1) + max(0, rect.y-filter_rect.y);
+ if(x < min(filter_rect.z, rect.z-filter_rect.x) && y < min(filter_rect.w, rect.w-filter_rect.y)) {
+ kernel_filter_nlm_construct_gramian(x, y,
+ dx, dy,
+ difference_image,
+ buffer,
+ transform, rank,
+ XtWX, XtWY,
+ rect, filter_rect,
+ w, h, f,
+ pass_stride,
+ get_local_id(1)*get_local_size(0) + get_local_id(0));
+ }
+}
+
+__kernel void kernel_ocl_filter_finalize(int w,
+ int h,
+ ccl_global float *buffer,
+ ccl_global int *rank,
+ ccl_global float *XtWX,
+ ccl_global float3 *XtWY,
+ int4 filter_area,
+ int4 buffer_params,
+ int sample)
+{
+ int x = get_global_id(0);
+ int y = get_global_id(1);
+ if(x < filter_area.z && y < filter_area.w) {
+ int storage_ofs = y*filter_area.z+x;
+ rank += storage_ofs;
+ XtWX += storage_ofs;
+ XtWY += storage_ofs;
+ kernel_filter_finalize(x, y, w, h, buffer, rank, filter_area.z*filter_area.w, XtWX, XtWY, buffer_params, sample);
+ }
+}
+
+__kernel void kernel_ocl_filter_set_tiles(ccl_global TilesInfo* tiles,
+ ccl_global float *buffer_1,
+ ccl_global float *buffer_2,
+ ccl_global float *buffer_3,
+ ccl_global float *buffer_4,
+ ccl_global float *buffer_5,
+ ccl_global float *buffer_6,
+ ccl_global float *buffer_7,
+ ccl_global float *buffer_8,
+ ccl_global float *buffer_9)
+{
+ if((get_global_id(0) == 0) && (get_global_id(1) == 0)) {
+ tiles->buffers[0] = buffer_1;
+ tiles->buffers[1] = buffer_2;
+ tiles->buffers[2] = buffer_3;
+ tiles->buffers[3] = buffer_4;
+ tiles->buffers[4] = buffer_5;
+ tiles->buffers[5] = buffer_6;
+ tiles->buffers[6] = buffer_7;
+ tiles->buffers[7] = buffer_8;
+ tiles->buffers[8] = buffer_9;
+ }
+}
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_buffer_update.cl b/intern/cycles/kernel/kernels/opencl/kernel_buffer_update.cl
index db65c91baf7..dcea2630aef 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_buffer_update.cl
+++ b/intern/cycles/kernel/kernels/opencl/kernel_buffer_update.cl
@@ -18,10 +18,9 @@
#include "kernel/split/kernel_split_common.h"
#include "kernel/split/kernel_buffer_update.h"
-__kernel void kernel_ocl_path_trace_buffer_update(
- ccl_global char *kg,
- ccl_constant KernelData *data)
-{
- ccl_local unsigned int local_queue_atomics;
- kernel_buffer_update((KernelGlobals*)kg, &local_queue_atomics);
-}
+#define KERNEL_NAME buffer_update
+#define LOCALS_TYPE unsigned int
+#include "kernel/kernels/opencl/kernel_split_function.h"
+#undef KERNEL_NAME
+#undef LOCALS_TYPE
+
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_direct_lighting.cl b/intern/cycles/kernel/kernels/opencl/kernel_direct_lighting.cl
index eb34f750881..ed64ae01aae 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_direct_lighting.cl
+++ b/intern/cycles/kernel/kernels/opencl/kernel_direct_lighting.cl
@@ -18,10 +18,9 @@
#include "kernel/split/kernel_split_common.h"
#include "kernel/split/kernel_direct_lighting.h"
-__kernel void kernel_ocl_path_trace_direct_lighting(
- ccl_global char *kg,
- ccl_constant KernelData *data)
-{
- ccl_local unsigned int local_queue_atomics;
- kernel_direct_lighting((KernelGlobals*)kg, &local_queue_atomics);
-}
+#define KERNEL_NAME direct_lighting
+#define LOCALS_TYPE unsigned int
+#include "kernel/kernels/opencl/kernel_split_function.h"
+#undef KERNEL_NAME
+#undef LOCALS_TYPE
+
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_do_volume.cl b/intern/cycles/kernel/kernels/opencl/kernel_do_volume.cl
index 83ef5f5f3f2..8afaa686e28 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_do_volume.cl
+++ b/intern/cycles/kernel/kernels/opencl/kernel_do_volume.cl
@@ -18,9 +18,7 @@
#include "kernel/split/kernel_split_common.h"
#include "kernel/split/kernel_do_volume.h"
-__kernel void kernel_ocl_path_trace_do_volume(
- ccl_global char *kg,
- ccl_constant KernelData *data)
-{
- kernel_do_volume((KernelGlobals*)kg);
-}
+#define KERNEL_NAME do_volume
+#include "kernel/kernels/opencl/kernel_split_function.h"
+#undef KERNEL_NAME
+
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_enqueue_inactive.cl b/intern/cycles/kernel/kernels/opencl/kernel_enqueue_inactive.cl
new file mode 100644
index 00000000000..e68d4104a91
--- /dev/null
+++ b/intern/cycles/kernel/kernels/opencl/kernel_enqueue_inactive.cl
@@ -0,0 +1,26 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernel/kernel_compat_opencl.h"
+#include "kernel/split/kernel_split_common.h"
+#include "kernel/split/kernel_enqueue_inactive.h"
+
+#define KERNEL_NAME enqueue_inactive
+#define LOCALS_TYPE unsigned int
+#include "kernel/kernels/opencl/kernel_split_function.h"
+#undef KERNEL_NAME
+#undef LOCALS_TYPE
+
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl b/intern/cycles/kernel/kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl
index d071b39aa6f..9e1e57beba6 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl
+++ b/intern/cycles/kernel/kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl
@@ -18,12 +18,9 @@
#include "kernel/split/kernel_split_common.h"
#include "kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h"
-__kernel void kernel_ocl_path_trace_holdout_emission_blurring_pathtermination_ao(
- ccl_global char *kg,
- ccl_constant KernelData *data)
-{
- ccl_local BackgroundAOLocals locals;
- kernel_holdout_emission_blurring_pathtermination_ao(
- (KernelGlobals*)kg,
- &locals);
-}
+#define KERNEL_NAME holdout_emission_blurring_pathtermination_ao
+#define LOCALS_TYPE BackgroundAOLocals
+#include "kernel/kernels/opencl/kernel_split_function.h"
+#undef KERNEL_NAME
+#undef LOCALS_TYPE
+
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_indirect_background.cl b/intern/cycles/kernel/kernels/opencl/kernel_indirect_background.cl
index 8c213ff5cb2..192d01444ba 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_indirect_background.cl
+++ b/intern/cycles/kernel/kernels/opencl/kernel_indirect_background.cl
@@ -18,9 +18,7 @@
#include "kernel/split/kernel_split_common.h"
#include "kernel/split/kernel_indirect_background.h"
-__kernel void kernel_ocl_path_trace_indirect_background(
- ccl_global char *kg,
- ccl_constant KernelData *data)
-{
- kernel_indirect_background((KernelGlobals*)kg);
-}
+#define KERNEL_NAME indirect_background
+#include "kernel/kernels/opencl/kernel_split_function.h"
+#undef KERNEL_NAME
+
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_indirect_subsurface.cl b/intern/cycles/kernel/kernels/opencl/kernel_indirect_subsurface.cl
index 998ebc4c0c3..84938b889e5 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_indirect_subsurface.cl
+++ b/intern/cycles/kernel/kernels/opencl/kernel_indirect_subsurface.cl
@@ -18,9 +18,7 @@
#include "kernel/split/kernel_split_common.h"
#include "kernel/split/kernel_indirect_subsurface.h"
-__kernel void kernel_ocl_path_trace_indirect_subsurface(
- ccl_global char *kg,
- ccl_constant KernelData *data)
-{
- kernel_indirect_subsurface((KernelGlobals*)kg);
-}
+#define KERNEL_NAME indirect_subsurface
+#include "kernel/kernels/opencl/kernel_split_function.h"
+#undef KERNEL_NAME
+
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_lamp_emission.cl b/intern/cycles/kernel/kernels/opencl/kernel_lamp_emission.cl
index 822d2287715..c314dc96c33 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_lamp_emission.cl
+++ b/intern/cycles/kernel/kernels/opencl/kernel_lamp_emission.cl
@@ -18,9 +18,7 @@
#include "kernel/split/kernel_split_common.h"
#include "kernel/split/kernel_lamp_emission.h"
-__kernel void kernel_ocl_path_trace_lamp_emission(
- ccl_global char *kg,
- ccl_constant KernelData *data)
-{
- kernel_lamp_emission((KernelGlobals*)kg);
-}
+#define KERNEL_NAME lamp_emission
+#include "kernel/kernels/opencl/kernel_split_function.h"
+#undef KERNEL_NAME
+
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_next_iteration_setup.cl b/intern/cycles/kernel/kernels/opencl/kernel_next_iteration_setup.cl
index 6d207253a40..8b1332bf013 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_next_iteration_setup.cl
+++ b/intern/cycles/kernel/kernels/opencl/kernel_next_iteration_setup.cl
@@ -18,10 +18,9 @@
#include "kernel/split/kernel_split_common.h"
#include "kernel/split/kernel_next_iteration_setup.h"
-__kernel void kernel_ocl_path_trace_next_iteration_setup(
- ccl_global char *kg,
- ccl_constant KernelData *data)
-{
- ccl_local unsigned int local_queue_atomics;
- kernel_next_iteration_setup((KernelGlobals*)kg, &local_queue_atomics);
-}
+#define KERNEL_NAME next_iteration_setup
+#define LOCALS_TYPE unsigned int
+#include "kernel/kernels/opencl/kernel_split_function.h"
+#undef KERNEL_NAME
+#undef LOCALS_TYPE
+
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_path_init.cl b/intern/cycles/kernel/kernels/opencl/kernel_path_init.cl
index bd9aa9538c8..fa210e747c0 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_path_init.cl
+++ b/intern/cycles/kernel/kernels/opencl/kernel_path_init.cl
@@ -18,9 +18,7 @@
#include "kernel/split/kernel_split_common.h"
#include "kernel/split/kernel_path_init.h"
-__kernel void kernel_ocl_path_trace_path_init(
- ccl_global char *kg,
- ccl_constant KernelData *data)
-{
- kernel_path_init((KernelGlobals*)kg);
-}
+#define KERNEL_NAME path_init
+#include "kernel/kernels/opencl/kernel_split_function.h"
+#undef KERNEL_NAME
+
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_queue_enqueue.cl b/intern/cycles/kernel/kernels/opencl/kernel_queue_enqueue.cl
index 9be154e3d75..68ee6f1d536 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_queue_enqueue.cl
+++ b/intern/cycles/kernel/kernels/opencl/kernel_queue_enqueue.cl
@@ -18,10 +18,9 @@
#include "kernel/split/kernel_split_common.h"
#include "kernel/split/kernel_queue_enqueue.h"
-__kernel void kernel_ocl_path_trace_queue_enqueue(
- ccl_global char *kg,
- ccl_constant KernelData *data)
-{
- ccl_local QueueEnqueueLocals locals;
- kernel_queue_enqueue((KernelGlobals*)kg, &locals);
-}
+#define KERNEL_NAME queue_enqueue
+#define LOCALS_TYPE QueueEnqueueLocals
+#include "kernel/kernels/opencl/kernel_split_function.h"
+#undef KERNEL_NAME
+#undef LOCALS_TYPE
+
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_scene_intersect.cl b/intern/cycles/kernel/kernels/opencl/kernel_scene_intersect.cl
index eb4fb4d153a..10d09377ba9 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_scene_intersect.cl
+++ b/intern/cycles/kernel/kernels/opencl/kernel_scene_intersect.cl
@@ -18,9 +18,7 @@
#include "kernel/split/kernel_split_common.h"
#include "kernel/split/kernel_scene_intersect.h"
-__kernel void kernel_ocl_path_trace_scene_intersect(
- ccl_global char *kg,
- ccl_constant KernelData *data)
-{
- kernel_scene_intersect((KernelGlobals*)kg);
-}
+#define KERNEL_NAME scene_intersect
+#include "kernel/kernels/opencl/kernel_split_function.h"
+#undef KERNEL_NAME
+
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_shader_eval.cl b/intern/cycles/kernel/kernels/opencl/kernel_shader_eval.cl
index 6baee460986..40eaa561863 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_shader_eval.cl
+++ b/intern/cycles/kernel/kernels/opencl/kernel_shader_eval.cl
@@ -18,10 +18,7 @@
#include "kernel/split/kernel_split_common.h"
#include "kernel/split/kernel_shader_eval.h"
-__kernel void kernel_ocl_path_trace_shader_eval(
- ccl_global char *kg,
- ccl_constant KernelData *data)
-{
- ccl_local unsigned int local_queue_atomics;
- kernel_shader_eval((KernelGlobals*)kg, &local_queue_atomics);
-}
+#define KERNEL_NAME shader_eval
+#include "kernel/kernels/opencl/kernel_split_function.h"
+#undef KERNEL_NAME
+
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_shader_setup.cl b/intern/cycles/kernel/kernels/opencl/kernel_shader_setup.cl
new file mode 100644
index 00000000000..8c36100f762
--- /dev/null
+++ b/intern/cycles/kernel/kernels/opencl/kernel_shader_setup.cl
@@ -0,0 +1,26 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernel/kernel_compat_opencl.h"
+#include "kernel/split/kernel_split_common.h"
+#include "kernel/split/kernel_shader_setup.h"
+
+#define KERNEL_NAME shader_setup
+#define LOCALS_TYPE unsigned int
+#include "kernel/kernels/opencl/kernel_split_function.h"
+#undef KERNEL_NAME
+#undef LOCALS_TYPE
+
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_shader_sort.cl b/intern/cycles/kernel/kernels/opencl/kernel_shader_sort.cl
new file mode 100644
index 00000000000..bcacaa4a054
--- /dev/null
+++ b/intern/cycles/kernel/kernels/opencl/kernel_shader_sort.cl
@@ -0,0 +1,27 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernel/kernel_compat_opencl.h"
+#include "kernel/split/kernel_split_common.h"
+#include "kernel/split/kernel_shader_sort.h"
+
+__attribute__((reqd_work_group_size(64, 1, 1)))
+#define KERNEL_NAME shader_sort
+#define LOCALS_TYPE ShaderSortLocals
+#include "kernel/kernels/opencl/kernel_split_function.h"
+#undef KERNEL_NAME
+#undef LOCALS_TYPE
+
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked_ao.cl b/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked_ao.cl
index 6a8ef81b32a..8de250a375c 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked_ao.cl
+++ b/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked_ao.cl
@@ -18,9 +18,7 @@
#include "kernel/split/kernel_split_common.h"
#include "kernel/split/kernel_shadow_blocked_ao.h"
-__kernel void kernel_ocl_path_trace_shadow_blocked_ao(
- ccl_global char *kg,
- ccl_constant KernelData *data)
-{
- kernel_shadow_blocked_ao((KernelGlobals*)kg);
-}
+#define KERNEL_NAME shadow_blocked_ao
+#include "kernel/kernels/opencl/kernel_split_function.h"
+#undef KERNEL_NAME
+
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked_dl.cl b/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked_dl.cl
index b255cc5ef8b..29da77022ed 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked_dl.cl
+++ b/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked_dl.cl
@@ -18,9 +18,7 @@
#include "kernel/split/kernel_split_common.h"
#include "kernel/split/kernel_shadow_blocked_dl.h"
-__kernel void kernel_ocl_path_trace_shadow_blocked_dl(
- ccl_global char *kg,
- ccl_constant KernelData *data)
-{
- kernel_shadow_blocked_dl((KernelGlobals*)kg);
-}
+#define KERNEL_NAME shadow_blocked_dl
+#include "kernel/kernels/opencl/kernel_split_function.h"
+#undef KERNEL_NAME
+
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_split.cl b/intern/cycles/kernel/kernels/opencl/kernel_split.cl
index 732cda30115..651addb02f4 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_split.cl
+++ b/intern/cycles/kernel/kernels/opencl/kernel_split.cl
@@ -23,12 +23,15 @@
#include "kernel/kernels/opencl/kernel_do_volume.cl"
#include "kernel/kernels/opencl/kernel_indirect_background.cl"
#include "kernel/kernels/opencl/kernel_queue_enqueue.cl"
+#include "kernel/kernels/opencl/kernel_shader_setup.cl"
+#include "kernel/kernels/opencl/kernel_shader_sort.cl"
#include "kernel/kernels/opencl/kernel_shader_eval.cl"
#include "kernel/kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl"
#include "kernel/kernels/opencl/kernel_subsurface_scatter.cl"
#include "kernel/kernels/opencl/kernel_direct_lighting.cl"
#include "kernel/kernels/opencl/kernel_shadow_blocked_ao.cl"
#include "kernel/kernels/opencl/kernel_shadow_blocked_dl.cl"
+#include "kernel/kernels/opencl/kernel_enqueue_inactive.cl"
#include "kernel/kernels/opencl/kernel_next_iteration_setup.cl"
#include "kernel/kernels/opencl/kernel_indirect_subsurface.cl"
#include "kernel/kernels/opencl/kernel_buffer_update.cl"
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_split_function.h b/intern/cycles/kernel/kernels/opencl/kernel_split_function.h
new file mode 100644
index 00000000000..f1e914a70d4
--- /dev/null
+++ b/intern/cycles/kernel/kernels/opencl/kernel_split_function.h
@@ -0,0 +1,72 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#define KERNEL_NAME_JOIN(a, b) a ## _ ## b
+#define KERNEL_NAME_EVAL(a, b) KERNEL_NAME_JOIN(a, b)
+
+__kernel void KERNEL_NAME_EVAL(kernel_ocl_path_trace, KERNEL_NAME)(
+ ccl_global char *kg_global,
+ ccl_constant KernelData *data,
+
+ ccl_global void *split_data_buffer,
+ ccl_global char *ray_state,
+ ccl_global uint *rng_state,
+
+#define KERNEL_TEX(type, ttype, name) \
+ ccl_global type *name,
+#include "kernel/kernel_textures.h"
+
+ ccl_global int *queue_index,
+ ccl_global char *use_queues_flag,
+ ccl_global unsigned int *work_pools,
+ ccl_global float *buffer
+ )
+{
+#ifdef LOCALS_TYPE
+ ccl_local LOCALS_TYPE locals;
+#endif
+
+ KernelGlobals *kg = (KernelGlobals*)kg_global;
+
+ if(ccl_local_id(0) + ccl_local_id(1) == 0) {
+ kg->data = data;
+
+ kernel_split_params.rng_state = rng_state;
+ kernel_split_params.queue_index = queue_index;
+ kernel_split_params.use_queues_flag = use_queues_flag;
+ kernel_split_params.work_pools = work_pools;
+ kernel_split_params.buffer = buffer;
+
+ split_data_init(kg, &kernel_split_state, ccl_global_size(0)*ccl_global_size(1), split_data_buffer, ray_state);
+
+#define KERNEL_TEX(type, ttype, name) \
+ kg->name = name;
+#include "kernel/kernel_textures.h"
+ }
+
+ ccl_barrier(CCL_LOCAL_MEM_FENCE);
+
+ KERNEL_NAME_EVAL(kernel, KERNEL_NAME)(
+ kg
+#ifdef LOCALS_TYPE
+ , &locals
+#endif
+ );
+}
+
+#undef KERNEL_NAME_JOIN
+#undef KERNEL_NAME_EVAL
+
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_subsurface_scatter.cl b/intern/cycles/kernel/kernels/opencl/kernel_subsurface_scatter.cl
index 7a1838e485f..2b3be38df84 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_subsurface_scatter.cl
+++ b/intern/cycles/kernel/kernels/opencl/kernel_subsurface_scatter.cl
@@ -18,10 +18,7 @@
#include "kernel/split/kernel_split_common.h"
#include "kernel/split/kernel_subsurface_scatter.h"
-__kernel void kernel_ocl_path_trace_subsurface_scatter(
- ccl_global char *kg,
- ccl_constant KernelData *data)
-{
- ccl_local unsigned int local_queue_atomics;
- kernel_subsurface_scatter((KernelGlobals*)kg, &local_queue_atomics);
-}
+#define KERNEL_NAME subsurface_scatter
+#include "kernel/kernels/opencl/kernel_split_function.h"
+#undef KERNEL_NAME
+
diff --git a/intern/cycles/kernel/osl/osl_bssrdf.cpp b/intern/cycles/kernel/osl/osl_bssrdf.cpp
index 95beea01d25..27a96720c1e 100644
--- a/intern/cycles/kernel/osl/osl_bssrdf.cpp
+++ b/intern/cycles/kernel/osl/osl_bssrdf.cpp
@@ -39,7 +39,9 @@
#include "kernel/kernel_montecarlo.h"
#include "kernel/closure/alloc.h"
+#include "kernel/closure/bsdf_util.h"
#include "kernel/closure/bsdf_diffuse.h"
+#include "kernel/closure/bsdf_principled_diffuse.h"
#include "kernel/closure/bssrdf.h"
CCL_NAMESPACE_BEGIN
@@ -78,6 +80,7 @@ public:
bssrdf->albedo = albedo.x;
bssrdf->sharpness = sharpness;
bssrdf->N = params.N;
+ bssrdf->roughness = params.roughness;
sd->flag |= bssrdf_setup(bssrdf, (ClosureType)type);
}
@@ -89,6 +92,7 @@ public:
bssrdf->albedo = albedo.y;
bssrdf->sharpness = sharpness;
bssrdf->N = params.N;
+ bssrdf->roughness = params.roughness;
sd->flag |= bssrdf_setup(bssrdf, (ClosureType)type);
}
@@ -100,6 +104,7 @@ public:
bssrdf->albedo = albedo.z;
bssrdf->sharpness = sharpness;
bssrdf->N = params.N;
+ bssrdf->roughness = params.roughness;
sd->flag |= bssrdf_setup(bssrdf, (ClosureType)type);
}
}
@@ -180,5 +185,31 @@ ClosureParam *closure_bssrdf_burley_params()
CCLOSURE_PREPARE(closure_bssrdf_burley_prepare, BurleyBSSRDFClosure)
+/* Disney principled */
+
+class PrincipledBSSRDFClosure : public CBSSRDFClosure {
+public:
+ void setup(ShaderData *sd, int path_flag, float3 weight)
+ {
+ alloc(sd, path_flag, weight, CLOSURE_BSSRDF_PRINCIPLED_ID);
+ }
+};
+
+ClosureParam *closure_bssrdf_principled_params()
+{
+ static ClosureParam params[] = {
+ CLOSURE_FLOAT3_PARAM(PrincipledBSSRDFClosure, params.N),
+ CLOSURE_FLOAT3_PARAM(PrincipledBSSRDFClosure, radius),
+ CLOSURE_FLOAT_PARAM(PrincipledBSSRDFClosure, params.texture_blur),
+ CLOSURE_FLOAT3_PARAM(PrincipledBSSRDFClosure, albedo),
+ CLOSURE_FLOAT_PARAM(PrincipledBSSRDFClosure, params.roughness),
+ CLOSURE_STRING_KEYPARAM(PrincipledBSSRDFClosure, label, "label"),
+ CLOSURE_FINISH_PARAM(PrincipledBSSRDFClosure)
+ };
+ return params;
+}
+
+CCLOSURE_PREPARE(closure_bssrdf_principled_prepare, PrincipledBSSRDFClosure)
+
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/osl/osl_closures.cpp b/intern/cycles/kernel/osl/osl_closures.cpp
index f44714c2150..14c5c1c3db5 100644
--- a/intern/cycles/kernel/osl/osl_closures.cpp
+++ b/intern/cycles/kernel/osl/osl_closures.cpp
@@ -60,6 +60,8 @@
#include "kernel/closure/bsdf_ashikhmin_shirley.h"
#include "kernel/closure/bsdf_toon.h"
#include "kernel/closure/bsdf_hair.h"
+#include "kernel/closure/bsdf_principled_diffuse.h"
+#include "kernel/closure/bsdf_principled_sheen.h"
#include "kernel/closure/volume.h"
CCL_NAMESPACE_BEGIN
@@ -154,7 +156,7 @@ BSDF_CLOSURE_CLASS_BEGIN(MicrofacetBeckmannRefraction, microfacet_beckmann_refra
BSDF_CLOSURE_CLASS_END(MicrofacetBeckmannRefraction, microfacet_beckmann_refraction)
BSDF_CLOSURE_CLASS_BEGIN(HairReflection, hair_reflection, HairBsdf, LABEL_GLOSSY)
- CLOSURE_FLOAT3_PARAM(HairReflectionClosure, unused),
+ CLOSURE_FLOAT3_PARAM(HairReflectionClosure, params.N),
CLOSURE_FLOAT_PARAM(HairReflectionClosure, params.roughness1),
CLOSURE_FLOAT_PARAM(HairReflectionClosure, params.roughness2),
CLOSURE_FLOAT3_PARAM(HairReflectionClosure, params.T),
@@ -162,7 +164,7 @@ BSDF_CLOSURE_CLASS_BEGIN(HairReflection, hair_reflection, HairBsdf, LABEL_GLOSSY
BSDF_CLOSURE_CLASS_END(HairReflection, hair_reflection)
BSDF_CLOSURE_CLASS_BEGIN(HairTransmission, hair_transmission, HairBsdf, LABEL_GLOSSY)
- CLOSURE_FLOAT3_PARAM(HairTransmissionClosure, unused),
+ CLOSURE_FLOAT3_PARAM(HairTransmissionClosure, params.N),
CLOSURE_FLOAT_PARAM(HairTransmissionClosure, params.roughness1),
CLOSURE_FLOAT_PARAM(HairTransmissionClosure, params.roughness2),
CLOSURE_FLOAT3_PARAM(HairReflectionClosure, params.T),
@@ -176,6 +178,63 @@ VOLUME_CLOSURE_CLASS_END(VolumeHenyeyGreenstein, henyey_greenstein)
VOLUME_CLOSURE_CLASS_BEGIN(VolumeAbsorption, absorption, ShaderClosure, LABEL_SINGULAR)
VOLUME_CLOSURE_CLASS_END(VolumeAbsorption, absorption)
+BSDF_CLOSURE_CLASS_BEGIN(PrincipledDiffuse, principled_diffuse, PrincipledDiffuseBsdf, LABEL_DIFFUSE)
+ CLOSURE_FLOAT3_PARAM(PrincipledDiffuseClosure, params.N),
+ CLOSURE_FLOAT_PARAM(PrincipledDiffuseClosure, params.roughness),
+BSDF_CLOSURE_CLASS_END(PrincipledDiffuse, principled_diffuse)
+
+BSDF_CLOSURE_CLASS_BEGIN(PrincipledSheen, principled_sheen, PrincipledSheenBsdf, LABEL_DIFFUSE)
+ CLOSURE_FLOAT3_PARAM(PrincipledSheenClosure, params.N),
+BSDF_CLOSURE_CLASS_END(PrincipledSheen, principled_sheen)
+
+/* DISNEY PRINCIPLED CLEARCOAT */
+class PrincipledClearcoatClosure : public CBSDFClosure {
+public:
+ MicrofacetBsdf params;
+ float clearcoat, clearcoat_roughness;
+
+ MicrofacetBsdf *alloc(ShaderData *sd, int path_flag, float3 weight)
+ {
+ MicrofacetBsdf *bsdf = (MicrofacetBsdf*)bsdf_alloc_osl(sd, sizeof(MicrofacetBsdf), weight, &params);
+ MicrofacetExtra *extra = (MicrofacetExtra*)closure_alloc_extra(sd, sizeof(MicrofacetExtra));
+ if(bsdf && extra) {
+ bsdf->extra = extra;
+
+ bsdf->ior = 1.5f;
+
+ bsdf->alpha_x = clearcoat_roughness;
+ bsdf->alpha_y = clearcoat_roughness;
+
+ bsdf->extra->cspec0 = make_float3(0.04f, 0.04f, 0.04f);
+ bsdf->extra->clearcoat = clearcoat;
+
+ return bsdf;
+ }
+
+ return NULL;
+ }
+
+ void setup(ShaderData *sd, int path_flag, float3 weight)
+ {
+ MicrofacetBsdf *bsdf = alloc(sd, path_flag, weight);
+ sd->flag |= (bsdf) ? bsdf_microfacet_ggx_clearcoat_setup(bsdf, sd) : 0;
+ }
+};
+
+ClosureParam *closure_bsdf_principled_clearcoat_params()
+{
+ static ClosureParam params[] = {
+ CLOSURE_FLOAT3_PARAM(PrincipledClearcoatClosure, params.N),
+ CLOSURE_FLOAT_PARAM(PrincipledClearcoatClosure, clearcoat),
+ CLOSURE_FLOAT_PARAM(PrincipledClearcoatClosure, clearcoat_roughness),
+ CLOSURE_STRING_KEYPARAM(PrincipledClearcoatClosure, label, "label"),
+ CLOSURE_FINISH_PARAM(PrincipledClearcoatClosure)
+ };
+ return params;
+}
+CCLOSURE_PREPARE(closure_bsdf_principled_clearcoat_prepare, PrincipledClearcoatClosure)
+
+
/* Registration */
static void register_closure(OSL::ShadingSystem *ss, const char *name, int id, OSL::ClosureParam *params, OSL::PrepareClosureFunc prepare)
@@ -215,6 +274,16 @@ void OSLShader::register_closures(OSLShadingSystem *ss_)
closure_bsdf_microfacet_multi_ggx_glass_params(), closure_bsdf_microfacet_multi_ggx_glass_prepare);
register_closure(ss, "microfacet_multi_ggx_aniso", id++,
closure_bsdf_microfacet_multi_ggx_aniso_params(), closure_bsdf_microfacet_multi_ggx_aniso_prepare);
+ register_closure(ss, "microfacet_ggx_fresnel", id++,
+ closure_bsdf_microfacet_ggx_fresnel_params(), closure_bsdf_microfacet_ggx_fresnel_prepare);
+ register_closure(ss, "microfacet_ggx_aniso_fresnel", id++,
+ closure_bsdf_microfacet_ggx_aniso_fresnel_params(), closure_bsdf_microfacet_ggx_aniso_fresnel_prepare);
+ register_closure(ss, "microfacet_multi_ggx_fresnel", id++,
+ closure_bsdf_microfacet_multi_ggx_fresnel_params(), closure_bsdf_microfacet_multi_ggx_fresnel_prepare);
+ register_closure(ss, "microfacet_multi_ggx_glass_fresnel", id++,
+ closure_bsdf_microfacet_multi_ggx_glass_fresnel_params(), closure_bsdf_microfacet_multi_ggx_glass_fresnel_prepare);
+ register_closure(ss, "microfacet_multi_ggx_aniso_fresnel", id++,
+ closure_bsdf_microfacet_multi_ggx_aniso_fresnel_params(), closure_bsdf_microfacet_multi_ggx_aniso_fresnel_prepare);
register_closure(ss, "microfacet_beckmann", id++,
bsdf_microfacet_beckmann_params(), bsdf_microfacet_beckmann_prepare);
register_closure(ss, "microfacet_beckmann_aniso", id++,
@@ -229,6 +298,12 @@ void OSLShader::register_closures(OSLShadingSystem *ss_)
bsdf_diffuse_toon_params(), bsdf_diffuse_toon_prepare);
register_closure(ss, "glossy_toon", id++,
bsdf_glossy_toon_params(), bsdf_glossy_toon_prepare);
+ register_closure(ss, "principled_diffuse", id++,
+ bsdf_principled_diffuse_params(), bsdf_principled_diffuse_prepare);
+ register_closure(ss, "principled_sheen", id++,
+ bsdf_principled_sheen_params(), bsdf_principled_sheen_prepare);
+ register_closure(ss, "principled_clearcoat", id++,
+ closure_bsdf_principled_clearcoat_params(), closure_bsdf_principled_clearcoat_prepare);
register_closure(ss, "emission", id++,
closure_emission_params(), closure_emission_prepare);
@@ -248,6 +323,8 @@ void OSLShader::register_closures(OSLShadingSystem *ss_)
closure_bssrdf_gaussian_params(), closure_bssrdf_gaussian_prepare);
register_closure(ss, "bssrdf_burley", id++,
closure_bssrdf_burley_params(), closure_bssrdf_burley_prepare);
+ register_closure(ss, "bssrdf_principled", id++,
+ closure_bssrdf_principled_params(), closure_bssrdf_principled_prepare);
register_closure(ss, "hair_reflection", id++,
bsdf_hair_reflection_params(), bsdf_hair_reflection_prepare);
@@ -278,6 +355,86 @@ bool CBSDFClosure::skip(const ShaderData *sd, int path_flag, int scattering)
return false;
}
+
+/* GGX closures with Fresnel */
+
+class MicrofacetFresnelClosure : public CBSDFClosure {
+public:
+ MicrofacetBsdf params;
+ float3 color;
+ float3 cspec0;
+
+ MicrofacetBsdf *alloc(ShaderData *sd, int path_flag, float3 weight)
+ {
+ /* Technically, the MultiGGX Glass closure may also transmit. However,
+ * since this is set statically and only used for caustic flags, this
+ * is probably as good as it gets. */
+ if(!skip(sd, path_flag, LABEL_GLOSSY | LABEL_REFLECT)) {
+ MicrofacetBsdf *bsdf = (MicrofacetBsdf*)bsdf_alloc_osl(sd, sizeof(MicrofacetBsdf), weight, &params);
+ MicrofacetExtra *extra = (MicrofacetExtra*)closure_alloc_extra(sd, sizeof(MicrofacetExtra));
+ if(bsdf && extra) {
+ bsdf->extra = extra;
+ bsdf->extra->color = color;
+ bsdf->extra->cspec0 = cspec0;
+ return bsdf;
+ }
+ }
+
+ return NULL;
+ }
+};
+
+class MicrofacetGGXFresnelClosure : public MicrofacetFresnelClosure {
+public:
+ void setup(ShaderData *sd, int path_flag, float3 weight)
+ {
+ MicrofacetBsdf *bsdf = alloc(sd, path_flag, weight);
+ sd->flag |= (bsdf) ? bsdf_microfacet_ggx_fresnel_setup(bsdf, sd) : 0;
+ }
+};
+
+ClosureParam *closure_bsdf_microfacet_ggx_fresnel_params()
+{
+ static ClosureParam params[] = {
+ CLOSURE_FLOAT3_PARAM(MicrofacetGGXFresnelClosure, params.N),
+ CLOSURE_FLOAT_PARAM(MicrofacetGGXFresnelClosure, params.alpha_x),
+ CLOSURE_FLOAT_PARAM(MicrofacetGGXFresnelClosure, params.ior),
+ CLOSURE_FLOAT3_PARAM(MicrofacetGGXFresnelClosure, color),
+ CLOSURE_FLOAT3_PARAM(MicrofacetGGXFresnelClosure, cspec0),
+ CLOSURE_STRING_KEYPARAM(MicrofacetGGXFresnelClosure, label, "label"),
+ CLOSURE_FINISH_PARAM(MicrofacetGGXFresnelClosure)
+ };
+ return params;
+}
+CCLOSURE_PREPARE(closure_bsdf_microfacet_ggx_fresnel_prepare, MicrofacetGGXFresnelClosure);
+
+class MicrofacetGGXAnisoFresnelClosure : public MicrofacetFresnelClosure {
+public:
+ void setup(ShaderData *sd, int path_flag, float3 weight)
+ {
+ MicrofacetBsdf *bsdf = alloc(sd, path_flag, weight);
+ sd->flag |= (bsdf) ? bsdf_microfacet_ggx_aniso_fresnel_setup(bsdf, sd) : 0;
+ }
+};
+
+ClosureParam *closure_bsdf_microfacet_ggx_aniso_fresnel_params()
+{
+ static ClosureParam params[] = {
+ CLOSURE_FLOAT3_PARAM(MicrofacetGGXFresnelClosure, params.N),
+ CLOSURE_FLOAT3_PARAM(MicrofacetGGXFresnelClosure, params.T),
+ CLOSURE_FLOAT_PARAM(MicrofacetGGXFresnelClosure, params.alpha_x),
+ CLOSURE_FLOAT_PARAM(MicrofacetGGXFresnelClosure, params.alpha_y),
+ CLOSURE_FLOAT_PARAM(MicrofacetGGXFresnelClosure, params.ior),
+ CLOSURE_FLOAT3_PARAM(MicrofacetGGXFresnelClosure, color),
+ CLOSURE_FLOAT3_PARAM(MicrofacetGGXFresnelClosure, cspec0),
+ CLOSURE_STRING_KEYPARAM(MicrofacetGGXFresnelClosure, label, "label"),
+ CLOSURE_FINISH_PARAM(MicrofacetGGXFresnelClosure)
+ };
+ return params;
+}
+CCLOSURE_PREPARE(closure_bsdf_microfacet_ggx_aniso_fresnel_prepare, MicrofacetGGXAnisoFresnelClosure);
+
+
/* Multiscattering GGX closures */
class MicrofacetMultiClosure : public CBSDFClosure {
@@ -287,7 +444,7 @@ public:
MicrofacetBsdf *alloc(ShaderData *sd, int path_flag, float3 weight)
{
- /* Technically, the MultiGGX Glass closure may also transmit. However,
+ /* Technically, the MultiGGX closure may also transmit. However,
* since this is set statically and only used for caustic flags, this
* is probably as good as it gets. */
if(!skip(sd, path_flag, LABEL_GLOSSY|LABEL_REFLECT)) {
@@ -375,5 +532,110 @@ ClosureParam *closure_bsdf_microfacet_multi_ggx_glass_params()
}
CCLOSURE_PREPARE(closure_bsdf_microfacet_multi_ggx_glass_prepare, MicrofacetMultiGGXGlassClosure);
+
+/* Multiscattering GGX closures with Fresnel */
+
+class MicrofacetMultiFresnelClosure : public CBSDFClosure {
+public:
+ MicrofacetBsdf params;
+ float3 color;
+ float3 cspec0;
+
+ MicrofacetBsdf *alloc(ShaderData *sd, int path_flag, float3 weight)
+ {
+ /* Technically, the MultiGGX closure may also transmit. However,
+ * since this is set statically and only used for caustic flags, this
+ * is probably as good as it gets. */
+ if(!skip(sd, path_flag, LABEL_GLOSSY | LABEL_REFLECT)) {
+ MicrofacetBsdf *bsdf = (MicrofacetBsdf*)bsdf_alloc_osl(sd, sizeof(MicrofacetBsdf), weight, &params);
+ MicrofacetExtra *extra = (MicrofacetExtra*)closure_alloc_extra(sd, sizeof(MicrofacetExtra));
+ if(bsdf && extra) {
+ bsdf->extra = extra;
+ bsdf->extra->color = color;
+ bsdf->extra->cspec0 = cspec0;
+ return bsdf;
+ }
+ }
+
+ return NULL;
+ }
+};
+
+class MicrofacetMultiGGXFresnelClosure : public MicrofacetMultiFresnelClosure {
+public:
+ void setup(ShaderData *sd, int path_flag, float3 weight)
+ {
+ MicrofacetBsdf *bsdf = alloc(sd, path_flag, weight);
+ sd->flag |= (bsdf) ? bsdf_microfacet_multi_ggx_fresnel_setup(bsdf, sd) : 0;
+ }
+};
+
+ClosureParam *closure_bsdf_microfacet_multi_ggx_fresnel_params()
+{
+ static ClosureParam params[] = {
+ CLOSURE_FLOAT3_PARAM(MicrofacetMultiGGXFresnelClosure, params.N),
+ CLOSURE_FLOAT_PARAM(MicrofacetMultiGGXFresnelClosure, params.alpha_x),
+ CLOSURE_FLOAT_PARAM(MicrofacetMultiGGXFresnelClosure, params.ior),
+ CLOSURE_FLOAT3_PARAM(MicrofacetMultiGGXFresnelClosure, color),
+ CLOSURE_FLOAT3_PARAM(MicrofacetMultiGGXFresnelClosure, cspec0),
+ CLOSURE_STRING_KEYPARAM(MicrofacetMultiGGXFresnelClosure, label, "label"),
+ CLOSURE_FINISH_PARAM(MicrofacetMultiGGXFresnelClosure)
+ };
+ return params;
+}
+CCLOSURE_PREPARE(closure_bsdf_microfacet_multi_ggx_fresnel_prepare, MicrofacetMultiGGXFresnelClosure);
+
+class MicrofacetMultiGGXAnisoFresnelClosure : public MicrofacetMultiFresnelClosure {
+public:
+ void setup(ShaderData *sd, int path_flag, float3 weight)
+ {
+ MicrofacetBsdf *bsdf = alloc(sd, path_flag, weight);
+ sd->flag |= (bsdf) ? bsdf_microfacet_multi_ggx_aniso_fresnel_setup(bsdf, sd) : 0;
+ }
+};
+
+ClosureParam *closure_bsdf_microfacet_multi_ggx_aniso_fresnel_params()
+{
+ static ClosureParam params[] = {
+ CLOSURE_FLOAT3_PARAM(MicrofacetMultiGGXFresnelClosure, params.N),
+ CLOSURE_FLOAT3_PARAM(MicrofacetMultiGGXFresnelClosure, params.T),
+ CLOSURE_FLOAT_PARAM(MicrofacetMultiGGXFresnelClosure, params.alpha_x),
+ CLOSURE_FLOAT_PARAM(MicrofacetMultiGGXFresnelClosure, params.alpha_y),
+ CLOSURE_FLOAT_PARAM(MicrofacetMultiGGXFresnelClosure, params.ior),
+ CLOSURE_FLOAT3_PARAM(MicrofacetMultiGGXFresnelClosure, color),
+ CLOSURE_FLOAT3_PARAM(MicrofacetMultiGGXFresnelClosure, cspec0),
+ CLOSURE_STRING_KEYPARAM(MicrofacetMultiGGXFresnelClosure, label, "label"),
+ CLOSURE_FINISH_PARAM(MicrofacetMultiGGXFresnelClosure)
+ };
+ return params;
+}
+CCLOSURE_PREPARE(closure_bsdf_microfacet_multi_ggx_aniso_fresnel_prepare, MicrofacetMultiGGXAnisoFresnelClosure);
+
+class MicrofacetMultiGGXGlassFresnelClosure : public MicrofacetMultiFresnelClosure {
+public:
+ MicrofacetMultiGGXGlassFresnelClosure() : MicrofacetMultiFresnelClosure() {}
+
+ void setup(ShaderData *sd, int path_flag, float3 weight)
+ {
+ MicrofacetBsdf *bsdf = alloc(sd, path_flag, weight);
+ sd->flag |= (bsdf) ? bsdf_microfacet_multi_ggx_glass_fresnel_setup(bsdf, sd) : 0;
+ }
+};
+
+ClosureParam *closure_bsdf_microfacet_multi_ggx_glass_fresnel_params()
+{
+ static ClosureParam params[] = {
+ CLOSURE_FLOAT3_PARAM(MicrofacetMultiGGXFresnelClosure, params.N),
+ CLOSURE_FLOAT_PARAM(MicrofacetMultiGGXFresnelClosure, params.alpha_x),
+ CLOSURE_FLOAT_PARAM(MicrofacetMultiGGXFresnelClosure, params.ior),
+ CLOSURE_FLOAT3_PARAM(MicrofacetMultiGGXFresnelClosure, color),
+ CLOSURE_FLOAT3_PARAM(MicrofacetMultiGGXFresnelClosure, cspec0),
+ CLOSURE_STRING_KEYPARAM(MicrofacetMultiGGXFresnelClosure, label, "label"),
+ CLOSURE_FINISH_PARAM(MicrofacetMultiGGXFresnelClosure)
+ };
+ return params;
+}
+CCLOSURE_PREPARE(closure_bsdf_microfacet_multi_ggx_glass_fresnel_prepare, MicrofacetMultiGGXGlassFresnelClosure);
+
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/osl/osl_closures.h b/intern/cycles/kernel/osl/osl_closures.h
index 929cf00a7e6..ff5fd9cc905 100644
--- a/intern/cycles/kernel/osl/osl_closures.h
+++ b/intern/cycles/kernel/osl/osl_closures.h
@@ -51,10 +51,17 @@ OSL::ClosureParam *closure_bsdf_phong_ramp_params();
OSL::ClosureParam *closure_bssrdf_cubic_params();
OSL::ClosureParam *closure_bssrdf_gaussian_params();
OSL::ClosureParam *closure_bssrdf_burley_params();
+OSL::ClosureParam *closure_bssrdf_principled_params();
OSL::ClosureParam *closure_henyey_greenstein_volume_params();
OSL::ClosureParam *closure_bsdf_microfacet_multi_ggx_params();
OSL::ClosureParam *closure_bsdf_microfacet_multi_ggx_glass_params();
OSL::ClosureParam *closure_bsdf_microfacet_multi_ggx_aniso_params();
+OSL::ClosureParam *closure_bsdf_microfacet_ggx_fresnel_params();
+OSL::ClosureParam *closure_bsdf_microfacet_ggx_aniso_fresnel_params();
+OSL::ClosureParam *closure_bsdf_microfacet_multi_ggx_fresnel_params();
+OSL::ClosureParam *closure_bsdf_microfacet_multi_ggx_glass_fresnel_params();
+OSL::ClosureParam *closure_bsdf_microfacet_multi_ggx_aniso_fresnel_params();
+OSL::ClosureParam *closure_bsdf_principled_clearcoat_params();
void closure_emission_prepare(OSL::RendererServices *, int id, void *data);
void closure_background_prepare(OSL::RendererServices *, int id, void *data);
@@ -65,10 +72,17 @@ void closure_bsdf_phong_ramp_prepare(OSL::RendererServices *, int id, void *data
void closure_bssrdf_cubic_prepare(OSL::RendererServices *, int id, void *data);
void closure_bssrdf_gaussian_prepare(OSL::RendererServices *, int id, void *data);
void closure_bssrdf_burley_prepare(OSL::RendererServices *, int id, void *data);
+void closure_bssrdf_principled_prepare(OSL::RendererServices *, int id, void *data);
void closure_henyey_greenstein_volume_prepare(OSL::RendererServices *, int id, void *data);
void closure_bsdf_microfacet_multi_ggx_prepare(OSL::RendererServices *, int id, void *data);
void closure_bsdf_microfacet_multi_ggx_glass_prepare(OSL::RendererServices *, int id, void *data);
void closure_bsdf_microfacet_multi_ggx_aniso_prepare(OSL::RendererServices *, int id, void *data);
+void closure_bsdf_microfacet_ggx_fresnel_prepare(OSL::RendererServices *, int id, void *data);
+void closure_bsdf_microfacet_ggx_aniso_fresnel_prepare(OSL::RendererServices *, int id, void *data);
+void closure_bsdf_microfacet_multi_ggx_fresnel_prepare(OSL::RendererServices *, int id, void *data);
+void closure_bsdf_microfacet_multi_ggx_glass_fresnel_prepare(OSL::RendererServices *, int id, void *data);
+void closure_bsdf_microfacet_multi_ggx_aniso_fresnel_prepare(OSL::RendererServices *, int id, void *data);
+void closure_bsdf_principled_clearcoat_prepare(OSL::RendererServices *, int id, void *data);
#define CCLOSURE_PREPARE(name, classname) \
void name(RendererServices *, int id, void *data) \
diff --git a/intern/cycles/kernel/osl/osl_services.cpp b/intern/cycles/kernel/osl/osl_services.cpp
index b767c60c617..1535496c73d 100644
--- a/intern/cycles/kernel/osl/osl_services.cpp
+++ b/intern/cycles/kernel/osl/osl_services.cpp
@@ -824,7 +824,7 @@ bool OSLRenderServices::get_background_attribute(KernelGlobals *kg, ShaderData *
bool OSLRenderServices::get_attribute(OSL::ShaderGlobals *sg, bool derivatives, ustring object_name,
TypeDesc type, ustring name, void *val)
{
- if(sg->renderstate == NULL)
+ if(sg == NULL || sg->renderstate == NULL)
return false;
ShaderData *sd = (ShaderData *)(sg->renderstate);
diff --git a/intern/cycles/kernel/shaders/CMakeLists.txt b/intern/cycles/kernel/shaders/CMakeLists.txt
index b43f8402d42..1a8ed4c884a 100644
--- a/intern/cycles/kernel/shaders/CMakeLists.txt
+++ b/intern/cycles/kernel/shaders/CMakeLists.txt
@@ -81,13 +81,15 @@ set(SRC_OSL
node_wireframe.osl
node_hair_bsdf.osl
node_uv_map.osl
+ node_principled_bsdf.osl
node_rgb_to_bw.osl
)
set(SRC_OSL_HEADERS
- node_texture.h
node_color.h
node_fresnel.h
+ node_ramp_util.h
+ node_texture.h
stdosl.h
oslutil.h
)
diff --git a/intern/cycles/kernel/shaders/node_principled_bsdf.osl b/intern/cycles/kernel/shaders/node_principled_bsdf.osl
new file mode 100644
index 00000000000..6870d479af3
--- /dev/null
+++ b/intern/cycles/kernel/shaders/node_principled_bsdf.osl
@@ -0,0 +1,120 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "stdosl.h"
+#include "node_fresnel.h"
+
+shader node_principled_bsdf(
+ string distribution = "Multiscatter GGX",
+ color BaseColor = color(0.8, 0.8, 0.8),
+ float Subsurface = 0.0,
+ vector SubsurfaceRadius = vector(1.0, 1.0, 1.0),
+ color SubsurfaceColor = color(0.7, 0.1, 0.1),
+ float Metallic = 0.0,
+ float Specular = 0.5,
+ float SpecularTint = 0.0,
+ float Roughness = 0.5,
+ float Anisotropic = 0.0,
+ float AnisotropicRotation = 0.0,
+ float Sheen = 0.0,
+ float SheenTint = 0.5,
+ float Clearcoat = 0.0,
+ float ClearcoatRoughness = 0.03,
+ float IOR = 1.45,
+ float Transmission = 0.0,
+ float TransmissionRoughness = 0.0,
+ normal Normal = N,
+ normal ClearcoatNormal = N,
+ normal Tangent = normalize(dPdu),
+ output closure color BSDF = 0)
+{
+ float f = max(IOR, 1e-5);
+ float diffuse_weight = (1.0 - clamp(Metallic, 0.0, 1.0)) * (1.0 - clamp(Transmission, 0.0, 1.0));
+ float final_transmission = clamp(Transmission, 0.0, 1.0) * (1.0 - clamp(Metallic, 0.0, 1.0));
+ float specular_weight = (1.0 - final_transmission);
+
+ vector T = Tangent;
+
+ float m_cdlum = luminance(BaseColor);
+ color m_ctint = m_cdlum > 0.0 ? BaseColor / m_cdlum : color(0.0, 0.0, 0.0); // normalize lum. to isolate hue+sat
+
+ /* rotate tangent */
+ if (AnisotropicRotation != 0.0)
+ T = rotate(T, AnisotropicRotation * M_2PI, point(0.0, 0.0, 0.0), Normal);
+
+ if (diffuse_weight > 1e-5) {
+ if (Subsurface > 1e-5) {
+ color mixed_ss_base_color = SubsurfaceColor * Subsurface + BaseColor * (1.0 - Subsurface);
+ BSDF = mixed_ss_base_color * bssrdf_principled(Normal, Subsurface * SubsurfaceRadius, 0.0, SubsurfaceColor, Roughness);
+ } else {
+ BSDF = BaseColor * principled_diffuse(Normal, Roughness);
+ }
+
+ if (Sheen > 1e-5) {
+ color sheen_color = color(1.0, 1.0, 1.0) * (1.0 - SheenTint) + m_ctint * SheenTint;
+
+ BSDF = BSDF + sheen_color * Sheen * principled_sheen(Normal);
+ }
+
+ BSDF = BSDF * diffuse_weight;
+ }
+
+ if (specular_weight > 1e-5) {
+ float aspect = sqrt(1.0 - Anisotropic * 0.9);
+ float r2 = Roughness * Roughness;
+
+ float alpha_x = r2 / aspect;
+ float alpha_y = r2 * aspect;
+
+ color tmp_col = color(1.0, 1.0, 1.0) * (1.0 - SpecularTint) + m_ctint * SpecularTint;
+
+ color Cspec0 = (Specular * 0.08 * tmp_col) * (1.0 - Metallic) + BaseColor * Metallic;
+
+ if (distribution == "GGX" || Roughness <= 0.075) {
+ BSDF = BSDF + specular_weight * microfacet_ggx_aniso_fresnel(Normal, T, alpha_x, alpha_y, (2.0 / (1.0 - sqrt(0.08 * Specular))) - 1.0, BaseColor, Cspec0);
+ } else {
+ BSDF = BSDF + specular_weight * microfacet_multi_ggx_aniso_fresnel(Normal, T, alpha_x, alpha_y, (2.0 / (1.0 - sqrt(0.08 * Specular))) - 1.0, BaseColor, Cspec0);
+ }
+ }
+
+ if (final_transmission > 1e-5) {
+ color Cspec0 = BaseColor * SpecularTint + color(1.0, 1.0, 1.0) * (1.0 - SpecularTint);
+ float eta = backfacing() ? 1.0 / f : f;
+
+ if (distribution == "GGX" || Roughness <= 5e-2) {
+ float cosNO = dot(Normal, I);
+ float Fr = fresnel_dielectric_cos(cosNO, eta);
+
+ float refl_roughness = Roughness;
+ if (Roughness <= 1e-2)
+ refl_roughness = 0.0;
+
+ float transmission_roughness = refl_roughness;
+ if (distribution == "GGX")
+ transmission_roughness = 1.0 - (1.0 - refl_roughness) * (1.0 - TransmissionRoughness);
+
+ BSDF = BSDF + final_transmission * (Fr * microfacet_ggx_fresnel(Normal, refl_roughness * refl_roughness, eta, BaseColor, Cspec0) +
+ (1.0 - Fr) * BaseColor * microfacet_ggx_refraction(Normal, transmission_roughness * transmission_roughness, eta));
+ } else {
+ BSDF = BSDF + final_transmission * microfacet_multi_ggx_glass_fresnel(Normal, Roughness * Roughness, eta, BaseColor, Cspec0);
+ }
+ }
+
+ if (Clearcoat > 1e-5) {
+ BSDF = BSDF + principled_clearcoat(ClearcoatNormal, Clearcoat, ClearcoatRoughness * ClearcoatRoughness);
+ }
+}
+
diff --git a/intern/cycles/kernel/shaders/stdosl.h b/intern/cycles/kernel/shaders/stdosl.h
index a8dda8a12c9..c91d2918687 100644
--- a/intern/cycles/kernel/shaders/stdosl.h
+++ b/intern/cycles/kernel/shaders/stdosl.h
@@ -530,6 +530,11 @@ closure color microfacet_ggx_refraction(normal N, float ag, float eta) BUILTIN;
closure color microfacet_multi_ggx(normal N, float ag, color C) BUILTIN;
closure color microfacet_multi_ggx_aniso(normal N, vector T, float ax, float ay, color C) BUILTIN;
closure color microfacet_multi_ggx_glass(normal N, float ag, float eta, color C) BUILTIN;
+closure color microfacet_ggx_fresnel(normal N, float ag, float eta, color C, color Cspec0) BUILTIN;
+closure color microfacet_ggx_aniso_fresnel(normal N, vector T, float ax, float ay, float eta, color C, color Cspec0) BUILTIN;
+closure color microfacet_multi_ggx_fresnel(normal N, float ag, float eta, color C, color Cspec0) BUILTIN;
+closure color microfacet_multi_ggx_aniso_fresnel(normal N, vector T, float ax, float ay, float eta, color C, color Cspec0) BUILTIN;
+closure color microfacet_multi_ggx_glass_fresnel(normal N, float ag, float eta, color C, color Cspec0) BUILTIN;
closure color microfacet_beckmann(normal N, float ab) BUILTIN;
closure color microfacet_beckmann_aniso(normal N, vector T, float ax, float ay) BUILTIN;
closure color microfacet_beckmann_refraction(normal N, float ab, float eta) BUILTIN;
@@ -539,11 +544,15 @@ closure color emission() BUILTIN;
closure color background() BUILTIN;
closure color holdout() BUILTIN;
closure color ambient_occlusion() BUILTIN;
+closure color principled_diffuse(normal N, float roughness) BUILTIN;
+closure color principled_sheen(normal N) BUILTIN;
+closure color principled_clearcoat(normal N, float clearcoat, float clearcoat_roughness) BUILTIN;
// BSSRDF
closure color bssrdf_cubic(normal N, vector radius, float texture_blur, float sharpness) BUILTIN;
closure color bssrdf_gaussian(normal N, vector radius, float texture_blur) BUILTIN;
closure color bssrdf_burley(normal N, vector radius, float texture_blur, color albedo) BUILTIN;
+closure color bssrdf_principled(normal N, vector radius, float texture_blur, color subsurface_color, float roughness) BUILTIN;
// Hair
closure color hair_reflection(normal N, float roughnessu, float roughnessv, vector T, float offset) BUILTIN;
diff --git a/intern/cycles/kernel/split/kernel_branched.h b/intern/cycles/kernel/split/kernel_branched.h
new file mode 100644
index 00000000000..e2762a85fc8
--- /dev/null
+++ b/intern/cycles/kernel/split/kernel_branched.h
@@ -0,0 +1,220 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+#ifdef __BRANCHED_PATH__
+
+/* sets up the various state needed to do an indirect loop */
+ccl_device_inline void kernel_split_branched_path_indirect_loop_init(KernelGlobals *kg, int ray_index)
+{
+ SplitBranchedState *branched_state = &kernel_split_state.branched_state[ray_index];
+
+ /* save a copy of the state to restore later */
+#define BRANCHED_STORE(name) \
+ branched_state->name = kernel_split_state.name[ray_index];
+
+ BRANCHED_STORE(path_state);
+ BRANCHED_STORE(throughput);
+ BRANCHED_STORE(ray);
+ BRANCHED_STORE(sd);
+ BRANCHED_STORE(isect);
+ BRANCHED_STORE(ray_state);
+
+#undef BRANCHED_STORE
+
+ /* set loop counters to intial position */
+ branched_state->next_closure = 0;
+ branched_state->next_sample = 0;
+}
+
+/* ends an indirect loop and restores the previous state */
+ccl_device_inline void kernel_split_branched_path_indirect_loop_end(KernelGlobals *kg, int ray_index)
+{
+ SplitBranchedState *branched_state = &kernel_split_state.branched_state[ray_index];
+
+ /* restore state */
+#define BRANCHED_RESTORE(name) \
+ kernel_split_state.name[ray_index] = branched_state->name;
+
+ BRANCHED_RESTORE(path_state);
+ BRANCHED_RESTORE(throughput);
+ BRANCHED_RESTORE(ray);
+ BRANCHED_RESTORE(sd);
+ BRANCHED_RESTORE(isect);
+ BRANCHED_RESTORE(ray_state);
+
+#undef BRANCHED_RESTORE
+
+ /* leave indirect loop */
+ REMOVE_RAY_FLAG(kernel_split_state.ray_state, ray_index, RAY_BRANCHED_INDIRECT);
+}
+
+ccl_device_inline bool kernel_split_branched_indirect_start_shared(KernelGlobals *kg, int ray_index)
+{
+ ccl_global char *ray_state = kernel_split_state.ray_state;
+
+ int inactive_ray = dequeue_ray_index(QUEUE_INACTIVE_RAYS,
+ kernel_split_state.queue_data, kernel_split_params.queue_size, kernel_split_params.queue_index);
+
+ if(!IS_STATE(ray_state, inactive_ray, RAY_INACTIVE)) {
+ return false;
+ }
+
+#define SPLIT_DATA_ENTRY(type, name, num) \
+ kernel_split_state.name[inactive_ray] = kernel_split_state.name[ray_index];
+ SPLIT_DATA_ENTRIES_BRANCHED_SHARED
+#undef SPLIT_DATA_ENTRY
+
+ kernel_split_state.branched_state[inactive_ray].shared_sample_count = 0;
+ kernel_split_state.branched_state[inactive_ray].original_ray = ray_index;
+ kernel_split_state.branched_state[inactive_ray].waiting_on_shared_samples = false;
+
+ PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
+ PathRadiance *inactive_L = &kernel_split_state.path_radiance[inactive_ray];
+
+ path_radiance_init(inactive_L, kernel_data.film.use_light_pass);
+ inactive_L->direct_throughput = L->direct_throughput;
+ path_radiance_copy_indirect(inactive_L, L);
+
+ ray_state[inactive_ray] = RAY_REGENERATED;
+ ADD_RAY_FLAG(ray_state, inactive_ray, RAY_BRANCHED_INDIRECT_SHARED);
+ ADD_RAY_FLAG(ray_state, inactive_ray, IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT));
+
+ atomic_fetch_and_inc_uint32((ccl_global uint*)&kernel_split_state.branched_state[ray_index].shared_sample_count);
+
+ return true;
+}
+
+/* bounce off surface and integrate indirect light */
+ccl_device_noinline bool kernel_split_branched_path_surface_indirect_light_iter(KernelGlobals *kg,
+ int ray_index,
+ float num_samples_adjust,
+ ShaderData *saved_sd,
+ bool reset_path_state,
+ bool wait_for_shared)
+{
+ SplitBranchedState *branched_state = &kernel_split_state.branched_state[ray_index];
+
+ ShaderData *sd = saved_sd;
+ RNG rng = kernel_split_state.rng[ray_index];
+ PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
+ float3 throughput = branched_state->throughput;
+ ccl_global PathState *ps = &kernel_split_state.path_state[ray_index];
+
+ float sum_sample_weight = 0.0f;
+#ifdef __DENOISING_FEATURES__
+ if(ps->denoising_feature_weight > 0.0f) {
+ for(int i = 0; i < sd->num_closure; i++) {
+ const ShaderClosure *sc = &sd->closure[i];
+
+ /* transparency is not handled here, but in outer loop */
+ if(!CLOSURE_IS_BSDF(sc->type) || CLOSURE_IS_BSDF_TRANSPARENT(sc->type)) {
+ continue;
+ }
+
+ sum_sample_weight += sc->sample_weight;
+ }
+ }
+ else {
+ sum_sample_weight = 1.0f;
+ }
+#endif /* __DENOISING_FEATURES__ */
+
+ for(int i = branched_state->next_closure; i < sd->num_closure; i++) {
+ const ShaderClosure *sc = &sd->closure[i];
+
+ if(!CLOSURE_IS_BSDF(sc->type))
+ continue;
+ /* transparency is not handled here, but in outer loop */
+ if(sc->type == CLOSURE_BSDF_TRANSPARENT_ID)
+ continue;
+
+ int num_samples;
+
+ if(CLOSURE_IS_BSDF_DIFFUSE(sc->type))
+ num_samples = kernel_data.integrator.diffuse_samples;
+ else if(CLOSURE_IS_BSDF_BSSRDF(sc->type))
+ num_samples = 1;
+ else if(CLOSURE_IS_BSDF_GLOSSY(sc->type))
+ num_samples = kernel_data.integrator.glossy_samples;
+ else
+ num_samples = kernel_data.integrator.transmission_samples;
+
+ num_samples = ceil_to_int(num_samples_adjust*num_samples);
+
+ float num_samples_inv = num_samples_adjust/num_samples;
+ RNG bsdf_rng = cmj_hash(rng, i);
+
+ for(int j = branched_state->next_sample; j < num_samples; j++) {
+ if(reset_path_state) {
+ *ps = branched_state->path_state;
+ }
+
+ ccl_global float3 *tp = &kernel_split_state.throughput[ray_index];
+ *tp = throughput;
+
+ ccl_global Ray *bsdf_ray = &kernel_split_state.ray[ray_index];
+
+ if(!kernel_branched_path_surface_bounce(kg,
+ &bsdf_rng,
+ sd,
+ sc,
+ j,
+ num_samples,
+ tp,
+ ps,
+ L,
+ bsdf_ray,
+ sum_sample_weight))
+ {
+ continue;
+ }
+
+ /* update state for next iteration */
+ branched_state->next_closure = i;
+ branched_state->next_sample = j+1;
+ branched_state->num_samples = num_samples;
+
+ /* start the indirect path */
+ *tp *= num_samples_inv;
+
+ if(kernel_split_branched_indirect_start_shared(kg, ray_index)) {
+ continue;
+ }
+
+ return true;
+ }
+
+ branched_state->next_sample = 0;
+ }
+
+ branched_state->next_closure = sd->num_closure;
+
+ if(wait_for_shared) {
+ branched_state->waiting_on_shared_samples = (branched_state->shared_sample_count > 0);
+ if(branched_state->waiting_on_shared_samples) {
+ return true;
+ }
+ }
+
+ return false;
+}
+
+#endif /* __BRANCHED_PATH__ */
+
+CCL_NAMESPACE_END
+
diff --git a/intern/cycles/kernel/split/kernel_buffer_update.h b/intern/cycles/kernel/split/kernel_buffer_update.h
index 859c221d976..4c1fdd2d69c 100644
--- a/intern/cycles/kernel/split/kernel_buffer_update.h
+++ b/intern/cycles/kernel/split/kernel_buffer_update.h
@@ -111,24 +111,14 @@ ccl_device void kernel_buffer_update(KernelGlobals *kg,
buffer += (kernel_split_params.offset + pixel_x + pixel_y*stride) * kernel_data.film.pass_stride;
if(IS_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER)) {
- float3 L_sum;
-#ifdef __SHADOW_TRICKS__
- if(state->flag & PATH_RAY_SHADOW_CATCHER) {
- L_sum = path_radiance_sum_shadowcatcher(kg, L, L_transparent);
- }
- else
-#endif /* __SHADOW_TRICKS__ */
- {
- L_sum = path_radiance_clamp_and_sum(kg, L);
- }
- kernel_write_light_passes(kg, buffer, L, sample);
#ifdef __KERNEL_DEBUG__
kernel_write_debug_passes(kg, buffer, state, debug_data, sample);
#endif
- float4 L_rad = make_float4(L_sum.x, L_sum.y, L_sum.z, 1.0f - (*L_transparent));
/* accumulate result in output buffer */
- kernel_write_pass_float4(buffer, sample, L_rad);
+ bool is_shadow_catcher = (state->flag & PATH_RAY_SHADOW_CATCHER);
+ kernel_write_result(kg, buffer, sample, L, 1.0f - (*L_transparent), is_shadow_catcher);
+
path_rng_end(kg, rng_state, rng);
ASSIGN_RAY_STATE(ray_state, ray_index, RAY_TO_REGENERATE);
diff --git a/intern/cycles/kernel/split/kernel_data_init.h b/intern/cycles/kernel/split/kernel_data_init.h
index 9d3d01fff75..e4545d66eff 100644
--- a/intern/cycles/kernel/split/kernel_data_init.h
+++ b/intern/cycles/kernel/split/kernel_data_init.h
@@ -67,6 +67,10 @@ void KERNEL_FUNCTION_FULL_NAME(data_init)(
unsigned int num_samples,
ccl_global float *buffer)
{
+#ifdef KERNEL_STUB
+ STUB_ASSERT(KERNEL_ARCH, data_init);
+#else
+
#ifdef __KERNEL_OPENCL__
kg->data = data;
#endif
@@ -105,21 +109,16 @@ void KERNEL_FUNCTION_FULL_NAME(data_init)(
/* Initialize queue data and queue index. */
if(thread_index < queuesize) {
- /* Initialize active ray queue. */
- kernel_split_state.queue_data[QUEUE_ACTIVE_AND_REGENERATED_RAYS * queuesize + thread_index] = QUEUE_EMPTY_SLOT;
- /* Initialize background and buffer update queue. */
- kernel_split_state.queue_data[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS * queuesize + thread_index] = QUEUE_EMPTY_SLOT;
- /* Initialize shadow ray cast of AO queue. */
- kernel_split_state.queue_data[QUEUE_SHADOW_RAY_CAST_AO_RAYS * queuesize + thread_index] = QUEUE_EMPTY_SLOT;
- /* Initialize shadow ray cast of direct lighting queue. */
- kernel_split_state.queue_data[QUEUE_SHADOW_RAY_CAST_DL_RAYS * queuesize + thread_index] = QUEUE_EMPTY_SLOT;
+ for(int i = 0; i < NUM_QUEUES; i++) {
+ kernel_split_state.queue_data[i * queuesize + thread_index] = QUEUE_EMPTY_SLOT;
+ }
}
if(thread_index == 0) {
- Queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS] = 0;
- Queue_index[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS] = 0;
- Queue_index[QUEUE_SHADOW_RAY_CAST_AO_RAYS] = 0;
- Queue_index[QUEUE_SHADOW_RAY_CAST_DL_RAYS] = 0;
+ for(int i = 0; i < NUM_QUEUES; i++) {
+ Queue_index[i] = 0;
+ }
+
/* The scene-intersect kernel should not use the queues very first time.
* since the queue would be empty.
*/
@@ -148,6 +147,8 @@ void KERNEL_FUNCTION_FULL_NAME(data_init)(
*(rng_state + index) = hash_int_2d(x, y);
}
}
+
+#endif /* KERNEL_STUB */
}
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_direct_lighting.h b/intern/cycles/kernel/split/kernel_direct_lighting.h
index bdbf7387b95..3336c968a44 100644
--- a/intern/cycles/kernel/split/kernel_direct_lighting.h
+++ b/intern/cycles/kernel/split/kernel_direct_lighting.h
@@ -56,23 +56,6 @@ ccl_device void kernel_direct_lighting(KernelGlobals *kg,
kernel_split_params.queue_size,
0);
-#ifdef __COMPUTE_DEVICE_GPU__
- /* If we are executing on a GPU device, we exit all threads that are not
- * required.
- *
- * If we are executing on a CPU device, then we need to keep all threads
- * active since we have barrier() calls later in the kernel. CPU devices,
- * expect all threads to execute barrier statement.
- */
- if(ray_index == QUEUE_EMPTY_SLOT) {
- return;
- }
-#endif
-
-#ifndef __COMPUTE_DEVICE_GPU__
- if(ray_index != QUEUE_EMPTY_SLOT) {
-#endif
-
if(IS_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE)) {
ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
ShaderData *sd = &kernel_split_state.sd[ray_index];
@@ -80,25 +63,24 @@ ccl_device void kernel_direct_lighting(KernelGlobals *kg,
/* direct lighting */
#ifdef __EMISSION__
RNG rng = kernel_split_state.rng[ray_index];
+
bool flag = (kernel_data.integrator.use_direct_light &&
(sd->flag & SD_BSDF_HAS_EVAL));
+
+# ifdef __BRANCHED_PATH__
+ if(flag && kernel_data.integrator.branched) {
+ flag = false;
+ enqueue_flag = 1;
+ }
+# endif /* __BRANCHED_PATH__ */
+
# ifdef __SHADOW_TRICKS__
if(flag && state->flag & PATH_RAY_SHADOW_CATCHER) {
flag = false;
- ShaderData *emission_sd = &kernel_split_state.sd_DL_shadow[ray_index];
- float3 throughput = kernel_split_state.throughput[ray_index];
- PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
- kernel_branched_path_surface_connect_light(kg,
- &rng,
- sd,
- emission_sd,
- state,
- throughput,
- 1.0f,
- L,
- 1);
+ enqueue_flag = 1;
}
# endif /* __SHADOW_TRICKS__ */
+
if(flag) {
/* Sample illumination from lights to find path contribution. */
float light_t = path_state_rng_1D(kg, &rng, state, PRNG_LIGHT);
@@ -129,7 +111,6 @@ ccl_device void kernel_direct_lighting(KernelGlobals *kg,
kernel_split_state.bsdf_eval[ray_index] = L_light;
kernel_split_state.is_lamp[ray_index] = is_lamp;
/* Mark ray state for next shadow kernel. */
- ADD_RAY_FLAG(kernel_split_state.ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL);
enqueue_flag = 1;
}
}
@@ -138,10 +119,6 @@ ccl_device void kernel_direct_lighting(KernelGlobals *kg,
#endif /* __EMISSION__ */
}
-#ifndef __COMPUTE_DEVICE_GPU__
- }
-#endif
-
#ifdef __EMISSION__
/* Enqueue RAY_SHADOW_RAY_CAST_DL rays. */
enqueue_ray_index_local(ray_index,
@@ -152,6 +129,27 @@ ccl_device void kernel_direct_lighting(KernelGlobals *kg,
kernel_split_state.queue_data,
kernel_split_params.queue_index);
#endif
+
+#ifdef __BRANCHED_PATH__
+ /* Enqueue RAY_LIGHT_INDIRECT_NEXT_ITER rays
+ * this is the last kernel before next_iteration_setup that uses local atomics so we do this here
+ */
+ ccl_barrier(CCL_LOCAL_MEM_FENCE);
+ if(ccl_local_id(0) == 0 && ccl_local_id(1) == 0) {
+ *local_queue_atomics = 0;
+ }
+ ccl_barrier(CCL_LOCAL_MEM_FENCE);
+
+ ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
+ enqueue_ray_index_local(ray_index,
+ QUEUE_LIGHT_INDIRECT_ITER,
+ IS_STATE(kernel_split_state.ray_state, ray_index, RAY_LIGHT_INDIRECT_NEXT_ITER),
+ kernel_split_params.queue_size,
+ local_queue_atomics,
+ kernel_split_state.queue_data,
+ kernel_split_params.queue_index);
+
+#endif /* __BRANCHED_PATH__ */
}
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_do_volume.h b/intern/cycles/kernel/split/kernel_do_volume.h
index 47d3c280831..9f8dd2392d9 100644
--- a/intern/cycles/kernel/split/kernel_do_volume.h
+++ b/intern/cycles/kernel/split/kernel_do_volume.h
@@ -16,6 +16,100 @@
CCL_NAMESPACE_BEGIN
+#if defined(__BRANCHED_PATH__) && defined(__VOLUME__)
+
+ccl_device_inline void kernel_split_branched_path_volume_indirect_light_init(KernelGlobals *kg, int ray_index)
+{
+ kernel_split_branched_path_indirect_loop_init(kg, ray_index);
+
+ ADD_RAY_FLAG(kernel_split_state.ray_state, ray_index, RAY_BRANCHED_VOLUME_INDIRECT);
+}
+
+ccl_device_noinline bool kernel_split_branched_path_volume_indirect_light_iter(KernelGlobals *kg, int ray_index)
+{
+ SplitBranchedState *branched_state = &kernel_split_state.branched_state[ray_index];
+
+ ShaderData *sd = &kernel_split_state.sd[ray_index];
+ RNG rng = kernel_split_state.rng[ray_index];
+ PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
+ ShaderData *emission_sd = &kernel_split_state.sd_DL_shadow[ray_index];
+
+	/* GPU: no decoupled ray marching, scatter probabilistically */
+ int num_samples = kernel_data.integrator.volume_samples;
+ float num_samples_inv = 1.0f/num_samples;
+
+ Ray volume_ray = branched_state->ray;
+ volume_ray.t = (!IS_STATE(&branched_state->ray_state, 0, RAY_HIT_BACKGROUND)) ? branched_state->isect.t : FLT_MAX;
+
+ bool heterogeneous = volume_stack_is_heterogeneous(kg, branched_state->path_state.volume_stack);
+
+ for(int j = branched_state->next_sample; j < num_samples; j++) {
+ ccl_global PathState *ps = &kernel_split_state.path_state[ray_index];
+ *ps = branched_state->path_state;
+
+ ccl_global Ray *pray = &kernel_split_state.ray[ray_index];
+ *pray = branched_state->ray;
+
+ ccl_global float3 *tp = &kernel_split_state.throughput[ray_index];
+ *tp = branched_state->throughput * num_samples_inv;
+
+ /* branch RNG state */
+ path_state_branch(ps, j, num_samples);
+
+ /* integrate along volume segment with distance sampling */
+ VolumeIntegrateResult result = kernel_volume_integrate(
+ kg, ps, sd, &volume_ray, L, tp, &rng, heterogeneous);
+
+# ifdef __VOLUME_SCATTER__
+ if(result == VOLUME_PATH_SCATTERED) {
+ /* direct lighting */
+ kernel_path_volume_connect_light(kg, &rng, sd, emission_sd, *tp, &branched_state->path_state, L);
+
+ /* indirect light bounce */
+ if(!kernel_path_volume_bounce(kg, &rng, sd, tp, ps, L, pray)) {
+ continue;
+ }
+
+ /* start the indirect path */
+ branched_state->next_closure = 0;
+ branched_state->next_sample = j+1;
+ branched_state->num_samples = num_samples;
+
+ /* Attempting to share too many samples is slow for volumes as it causes us to
+ * loop here more and have many calls to kernel_volume_integrate which evaluates
+ * shaders. The many expensive shader evaluations cause the work load to become
+ * unbalanced and many threads to become idle in this kernel. Limiting the
+ * number of shared samples here helps quite a lot.
+ */
+ if(branched_state->shared_sample_count < 2) {
+ if(kernel_split_branched_indirect_start_shared(kg, ray_index)) {
+ continue;
+ }
+ }
+
+ return true;
+ }
+# endif
+ }
+
+ branched_state->next_sample = num_samples;
+
+ branched_state->waiting_on_shared_samples = (branched_state->shared_sample_count > 0);
+ if(branched_state->waiting_on_shared_samples) {
+ return true;
+ }
+
+ kernel_split_branched_path_indirect_loop_end(kg, ray_index);
+
+ /* todo: avoid this calculation using decoupled ray marching */
+ float3 throughput = kernel_split_state.throughput[ray_index];
+ kernel_volume_shadow(kg, emission_sd, &kernel_split_state.path_state[ray_index], &volume_ray, &throughput);
+ kernel_split_state.throughput[ray_index] = throughput;
+
+ return false;
+}
+
+#endif /* __BRANCHED_PATH__ && __VOLUME__ */
ccl_device void kernel_do_volume(KernelGlobals *kg)
{
@@ -23,37 +117,36 @@ ccl_device void kernel_do_volume(KernelGlobals *kg)
/* We will empty this queue in this kernel. */
if(ccl_global_id(0) == 0 && ccl_global_id(1) == 0) {
kernel_split_params.queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS] = 0;
+# ifdef __BRANCHED_PATH__
+ kernel_split_params.queue_index[QUEUE_VOLUME_INDIRECT_ITER] = 0;
+# endif /* __BRANCHED_PATH__ */
}
- /* Fetch use_queues_flag. */
- char local_use_queues_flag = *kernel_split_params.use_queues_flag;
- ccl_barrier(CCL_LOCAL_MEM_FENCE);
int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
- if(local_use_queues_flag) {
+
+ if(*kernel_split_params.use_queues_flag) {
ray_index = get_ray_index(kg, ray_index,
QUEUE_ACTIVE_AND_REGENERATED_RAYS,
kernel_split_state.queue_data,
kernel_split_params.queue_size,
1);
- if(ray_index == QUEUE_EMPTY_SLOT) {
- return;
- }
}
- if(IS_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE) ||
- IS_STATE(kernel_split_state.ray_state, ray_index, RAY_HIT_BACKGROUND)) {
+ ccl_global char *ray_state = kernel_split_state.ray_state;
- bool hit = ! IS_STATE(kernel_split_state.ray_state, ray_index, RAY_HIT_BACKGROUND);
-
- PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
- ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
+ PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
+ ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
+ if(IS_STATE(ray_state, ray_index, RAY_ACTIVE) ||
+ IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND)) {
ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index];
ccl_global Ray *ray = &kernel_split_state.ray[ray_index];
RNG rng = kernel_split_state.rng[ray_index];
ccl_global Intersection *isect = &kernel_split_state.isect[ray_index];
ShaderData *sd = &kernel_split_state.sd[ray_index];
- ShaderData *sd_input = &kernel_split_state.sd_DL_shadow[ray_index];
+ ShaderData *emission_sd = &kernel_split_state.sd_DL_shadow[ray_index];
+
+ bool hit = ! IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND);
/* Sanitize volume stack. */
if(!hit) {
@@ -64,31 +157,68 @@ ccl_device void kernel_do_volume(KernelGlobals *kg)
Ray volume_ray = *ray;
volume_ray.t = (hit)? isect->t: FLT_MAX;
- bool heterogeneous = volume_stack_is_heterogeneous(kg, state->volume_stack);
+# ifdef __BRANCHED_PATH__
+ if(!kernel_data.integrator.branched || IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT)) {
+# endif /* __BRANCHED_PATH__ */
+ bool heterogeneous = volume_stack_is_heterogeneous(kg, state->volume_stack);
- {
- /* integrate along volume segment with distance sampling */
- VolumeIntegrateResult result = kernel_volume_integrate(
- kg, state, sd, &volume_ray, L, throughput, &rng, heterogeneous);
+ {
+ /* integrate along volume segment with distance sampling */
+ VolumeIntegrateResult result = kernel_volume_integrate(
+ kg, state, sd, &volume_ray, L, throughput, &rng, heterogeneous);
# ifdef __VOLUME_SCATTER__
- if(result == VOLUME_PATH_SCATTERED) {
- /* direct lighting */
- kernel_path_volume_connect_light(kg, &rng, sd, sd_input, *throughput, state, L);
-
- /* indirect light bounce */
- if(kernel_path_volume_bounce(kg, &rng, sd, throughput, state, L, ray))
- ASSIGN_RAY_STATE(kernel_split_state.ray_state, ray_index, RAY_REGENERATED);
- else
- ASSIGN_RAY_STATE(kernel_split_state.ray_state, ray_index, RAY_UPDATE_BUFFER);
+ if(result == VOLUME_PATH_SCATTERED) {
+ /* direct lighting */
+ kernel_path_volume_connect_light(kg, &rng, sd, emission_sd, *throughput, state, L);
+
+ /* indirect light bounce */
+ if(kernel_path_volume_bounce(kg, &rng, sd, throughput, state, L, ray)) {
+ ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED);
+ }
+ else {
+ kernel_split_path_end(kg, ray_index);
+ }
+ }
+# endif /* __VOLUME_SCATTER__ */
}
-# endif
+
+# ifdef __BRANCHED_PATH__
}
+ else {
+ kernel_split_branched_path_volume_indirect_light_init(kg, ray_index);
+
+ if(kernel_split_branched_path_volume_indirect_light_iter(kg, ray_index)) {
+ ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED);
+ }
+ }
+# endif /* __BRANCHED_PATH__ */
}
+
kernel_split_state.rng[ray_index] = rng;
}
-#endif
+# ifdef __BRANCHED_PATH__
+ /* iter loop */
+ ray_index = get_ray_index(kg, ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0),
+ QUEUE_VOLUME_INDIRECT_ITER,
+ kernel_split_state.queue_data,
+ kernel_split_params.queue_size,
+ 1);
+
+ if(IS_STATE(ray_state, ray_index, RAY_VOLUME_INDIRECT_NEXT_ITER)) {
+ /* for render passes, sum and reset indirect light pass variables
+ * for the next samples */
+ path_radiance_sum_indirect(&kernel_split_state.path_radiance[ray_index]);
+ path_radiance_reset_indirect(&kernel_split_state.path_radiance[ray_index]);
+
+ if(kernel_split_branched_path_volume_indirect_light_iter(kg, ray_index)) {
+ ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED);
+ }
+ }
+# endif /* __BRANCHED_PATH__ */
+
+#endif /* __VOLUME__ */
}
diff --git a/intern/cycles/kernel/split/kernel_enqueue_inactive.h b/intern/cycles/kernel/split/kernel_enqueue_inactive.h
new file mode 100644
index 00000000000..496355bbc3a
--- /dev/null
+++ b/intern/cycles/kernel/split/kernel_enqueue_inactive.h
@@ -0,0 +1,46 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+ccl_device void kernel_enqueue_inactive(KernelGlobals *kg,
+ ccl_local_param unsigned int *local_queue_atomics)
+{
+#ifdef __BRANCHED_PATH__
+	/* Enqueue RAY_INACTIVE rays into QUEUE_INACTIVE_RAYS queue. */
+ if(ccl_local_id(0) == 0 && ccl_local_id(1) == 0) {
+ *local_queue_atomics = 0;
+ }
+ ccl_barrier(CCL_LOCAL_MEM_FENCE);
+
+ int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
+
+ char enqueue_flag = 0;
+ if(IS_STATE(kernel_split_state.ray_state, ray_index, RAY_INACTIVE)) {
+ enqueue_flag = 1;
+ }
+
+ enqueue_ray_index_local(ray_index,
+ QUEUE_INACTIVE_RAYS,
+ enqueue_flag,
+ kernel_split_params.queue_size,
+ local_queue_atomics,
+ kernel_split_state.queue_data,
+ kernel_split_params.queue_index);
+#endif /* __BRANCHED_PATH__ */
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h b/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h
index 9fc853a84bf..fec671be016 100644
--- a/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h
+++ b/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h
@@ -52,6 +52,7 @@ CCL_NAMESPACE_BEGIN
* - QUEUE_SHADOW_RAY_CAST_AO_RAYS will be filled with rays marked with
* flag RAY_SHADOW_RAY_CAST_AO
*/
+
ccl_device void kernel_holdout_emission_blurring_pathtermination_ao(
KernelGlobals *kg,
ccl_local_param BackgroundAOLocals *locals)
@@ -62,8 +63,9 @@ ccl_device void kernel_holdout_emission_blurring_pathtermination_ao(
}
ccl_barrier(CCL_LOCAL_MEM_FENCE);
+#ifdef __AO__
char enqueue_flag = 0;
- char enqueue_flag_AO_SHADOW_RAY_CAST = 0;
+#endif
int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
ray_index = get_ray_index(kg, ray_index,
QUEUE_ACTIVE_AND_REGENERATED_RAYS,
@@ -122,14 +124,22 @@ ccl_device void kernel_holdout_emission_blurring_pathtermination_ao(
#ifdef __SHADOW_TRICKS__
if((sd->object_flag & SD_OBJECT_SHADOW_CATCHER)) {
- if (state->flag & PATH_RAY_CAMERA) {
- state->flag |= (PATH_RAY_SHADOW_CATCHER | PATH_RAY_SHADOW_CATCHER_ONLY);
+ if(state->flag & PATH_RAY_CAMERA) {
+ PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
+ state->flag |= (PATH_RAY_SHADOW_CATCHER |
+ PATH_RAY_SHADOW_CATCHER_ONLY |
+ PATH_RAY_STORE_SHADOW_INFO);
state->catcher_object = sd->object;
if(!kernel_data.background.transparent) {
- PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
ccl_global Ray *ray = &kernel_split_state.ray[ray_index];
- L->shadow_color = indirect_background(kg, &kernel_split_state.sd_DL_shadow[ray_index], state, ray);
+ L->shadow_background_color = indirect_background(
+ kg,
+ &kernel_split_state.sd_DL_shadow[ray_index],
+ state,
+ ray);
}
+ L->shadow_radiance_sum = path_radiance_clamp_and_sum(kg, L);
+ L->shadow_throughput = average(throughput);
}
}
else {
@@ -155,8 +165,7 @@ ccl_device void kernel_holdout_emission_blurring_pathtermination_ao(
kernel_split_state.L_transparent[ray_index] += average(holdout_weight*throughput);
}
if(sd->object_flag & SD_OBJECT_HOLDOUT_MASK) {
- ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
- enqueue_flag = 1;
+ kernel_split_path_end(kg, ray_index);
}
}
#endif /* __HOLDOUT__ */
@@ -164,18 +173,31 @@ ccl_device void kernel_holdout_emission_blurring_pathtermination_ao(
if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
- /* Holdout mask objects do not write data passes. */
- kernel_write_data_passes(kg,
- buffer,
- L,
- sd,
- sample,
- state,
- throughput);
+
+#ifdef __BRANCHED_PATH__
+ if(!IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT))
+#endif /* __BRANCHED_PATH__ */
+ {
+ /* Holdout mask objects do not write data passes. */
+ kernel_write_data_passes(kg,
+ buffer,
+ L,
+ sd,
+ sample,
+ state,
+ throughput);
+ }
+
/* Blurring of bsdf after bounces, for rays that have a small likelihood
* of following this particular path (diffuse, rough glossy.
*/
- if(kernel_data.integrator.filter_glossy != FLT_MAX) {
+#ifndef __BRANCHED_PATH__
+ if(kernel_data.integrator.filter_glossy != FLT_MAX)
+#else
+ if(kernel_data.integrator.filter_glossy != FLT_MAX &&
+ (!kernel_data.integrator.branched || IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT)))
+#endif /* __BRANCHED_PATH__ */
+ {
float blur_pdf = kernel_data.integrator.filter_glossy*state->min_ray_pdf;
if(blur_pdf < 1.0f) {
float blur_roughness = sqrtf(1.0f - blur_pdf)*0.5f;
@@ -201,85 +223,62 @@ ccl_device void kernel_holdout_emission_blurring_pathtermination_ao(
* mainly due to the mixed in MIS that we use. gives too many unneeded
* shader evaluations, only need emission if we are going to terminate.
*/
+#ifndef __BRANCHED_PATH__
float probability = path_state_terminate_probability(kg, state, throughput);
+#else
+ float probability = 1.0f;
+
+ if(!kernel_data.integrator.branched) {
+ probability = path_state_terminate_probability(kg, state, throughput);
+ }
+ else if(IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT)) {
+ int num_samples = kernel_split_state.branched_state[ray_index].num_samples;
+ probability = path_state_terminate_probability(kg, state, throughput*num_samples);
+ }
+ else if(state->flag & PATH_RAY_TRANSPARENT) {
+ probability = path_state_terminate_probability(kg, state, throughput);
+ }
+#endif
if(probability == 0.0f) {
- ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
- enqueue_flag = 1;
+ kernel_split_path_end(kg, ray_index);
}
if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
if(probability != 1.0f) {
float terminate = path_state_rng_1D_for_decision(kg, &rng, state, PRNG_TERMINATE);
if(terminate >= probability) {
- ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
- enqueue_flag = 1;
+ kernel_split_path_end(kg, ray_index);
}
else {
kernel_split_state.throughput[ray_index] = throughput/probability;
}
}
+
+ kernel_update_denoising_features(kg, sd, state, L);
}
}
#ifdef __AO__
if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
/* ambient occlusion */
- if(kernel_data.integrator.use_ambient_occlusion ||
- (sd->flag & SD_AO))
- {
- /* todo: solve correlation */
- float bsdf_u, bsdf_v;
- path_state_rng_2D(kg, &rng, state, PRNG_BSDF_U, &bsdf_u, &bsdf_v);
-
- float ao_factor = kernel_data.background.ao_factor;
- float3 ao_N;
- kernel_split_state.ao_bsdf[ray_index] = shader_bsdf_ao(kg, sd, ao_factor, &ao_N);
- kernel_split_state.ao_alpha[ray_index] = shader_bsdf_alpha(kg, sd);
-
- float3 ao_D;
- float ao_pdf;
- sample_cos_hemisphere(ao_N, bsdf_u, bsdf_v, &ao_D, &ao_pdf);
-
- if(dot(sd->Ng, ao_D) > 0.0f && ao_pdf != 0.0f) {
- Ray _ray;
- _ray.P = ray_offset(sd->P, sd->Ng);
- _ray.D = ao_D;
- _ray.t = kernel_data.background.ao_distance;
-#ifdef __OBJECT_MOTION__
- _ray.time = sd->time;
-#endif
- _ray.dP = sd->dP;
- _ray.dD = differential3_zero();
- kernel_split_state.ao_light_ray[ray_index] = _ray;
-
- ADD_RAY_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO);
- enqueue_flag_AO_SHADOW_RAY_CAST = 1;
- }
+ if(kernel_data.integrator.use_ambient_occlusion || (sd->flag & SD_AO)) {
+ enqueue_flag = 1;
}
}
#endif /* __AO__ */
- kernel_split_state.rng[ray_index] = rng;
+ kernel_split_state.rng[ray_index] = rng;
#ifndef __COMPUTE_DEVICE_GPU__
}
#endif
- /* Enqueue RAY_UPDATE_BUFFER rays. */
- enqueue_ray_index_local(ray_index,
- QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
- enqueue_flag,
- kernel_split_params.queue_size,
- &locals->queue_atomics_bg,
- kernel_split_state.queue_data,
- kernel_split_params.queue_index);
-
#ifdef __AO__
/* Enqueue to-shadow-ray-cast rays. */
enqueue_ray_index_local(ray_index,
QUEUE_SHADOW_RAY_CAST_AO_RAYS,
- enqueue_flag_AO_SHADOW_RAY_CAST,
+ enqueue_flag,
kernel_split_params.queue_size,
&locals->queue_atomics_ao,
kernel_split_state.queue_data,
diff --git a/intern/cycles/kernel/split/kernel_indirect_background.h b/intern/cycles/kernel/split/kernel_indirect_background.h
index 8192528622e..f0ebb90f60a 100644
--- a/intern/cycles/kernel/split/kernel_indirect_background.h
+++ b/intern/cycles/kernel/split/kernel_indirect_background.h
@@ -23,7 +23,7 @@ ccl_device void kernel_indirect_background(KernelGlobals *kg)
int thread_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
int ray_index;
- if(kernel_data.integrator.ao_bounces) {
+ if(kernel_data.integrator.ao_bounces != INT_MAX) {
ray_index = get_ray_index(kg, thread_index,
QUEUE_ACTIVE_AND_REGENERATED_RAYS,
kernel_split_state.queue_data,
@@ -34,7 +34,7 @@ ccl_device void kernel_indirect_background(KernelGlobals *kg)
if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
if(state->bounce > kernel_data.integrator.ao_bounces) {
- ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
+ kernel_split_path_end(kg, ray_index);
}
}
}
@@ -63,7 +63,7 @@ ccl_device void kernel_indirect_background(KernelGlobals *kg)
#ifdef __PASSES__
if(!(kernel_data.film.pass_flag & PASS_BACKGROUND))
#endif
- ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
+ kernel_split_path_end(kg, ray_index);
}
if(IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND)) {
@@ -72,7 +72,7 @@ ccl_device void kernel_indirect_background(KernelGlobals *kg)
float3 L_background = indirect_background(kg, &kernel_split_state.sd_DL_shadow[ray_index], state, ray);
path_radiance_accum_background(L, state, (*throughput), L_background);
#endif
- ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
+ kernel_split_path_end(kg, ray_index);
}
}
diff --git a/intern/cycles/kernel/split/kernel_indirect_subsurface.h b/intern/cycles/kernel/split/kernel_indirect_subsurface.h
index a56e85abeb9..82bc2f01fd7 100644
--- a/intern/cycles/kernel/split/kernel_indirect_subsurface.h
+++ b/intern/cycles/kernel/split/kernel_indirect_subsurface.h
@@ -49,26 +49,29 @@ ccl_device void kernel_indirect_subsurface(KernelGlobals *kg)
ccl_global Ray *ray = &kernel_split_state.ray[ray_index];
ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index];
- if(IS_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER)) {
- ccl_addr_space SubsurfaceIndirectRays *ss_indirect = &kernel_split_state.ss_rays[ray_index];
- kernel_path_subsurface_accum_indirect(ss_indirect, L);
+#ifdef __BRANCHED_PATH__
+ if(!kernel_data.integrator.branched) {
+#endif
+ if(IS_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER)) {
+ ccl_addr_space SubsurfaceIndirectRays *ss_indirect = &kernel_split_state.ss_rays[ray_index];
+ kernel_path_subsurface_accum_indirect(ss_indirect, L);
- /* Trace indirect subsurface rays by restarting the loop. this uses less
- * stack memory than invoking kernel_path_indirect.
- */
- if(ss_indirect->num_rays) {
- kernel_path_subsurface_setup_indirect(kg,
- ss_indirect,
- state,
- ray,
- L,
- throughput);
- ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED);
- }
- else {
- ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
+ /* Trace indirect subsurface rays by restarting the loop. this uses less
+ * stack memory than invoking kernel_path_indirect.
+ */
+ if(ss_indirect->num_rays) {
+ kernel_path_subsurface_setup_indirect(kg,
+ ss_indirect,
+ state,
+ ray,
+ L,
+ throughput);
+ ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED);
+ }
}
+#ifdef __BRANCHED_PATH__
}
+#endif
#endif /* __SUBSURFACE__ */
diff --git a/intern/cycles/kernel/split/kernel_next_iteration_setup.h b/intern/cycles/kernel/split/kernel_next_iteration_setup.h
index 1bebc16e25b..7758e35fd32 100644
--- a/intern/cycles/kernel/split/kernel_next_iteration_setup.h
+++ b/intern/cycles/kernel/split/kernel_next_iteration_setup.h
@@ -44,6 +44,52 @@ CCL_NAMESPACE_BEGIN
* - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with
* RAY_TO_REGENERATE and more RAY_UPDATE_BUFFER rays.
*/
+
+#ifdef __BRANCHED_PATH__
+ccl_device_inline void kernel_split_branched_indirect_light_init(KernelGlobals *kg, int ray_index)
+{
+ kernel_split_branched_path_indirect_loop_init(kg, ray_index);
+
+ ADD_RAY_FLAG(kernel_split_state.ray_state, ray_index, RAY_BRANCHED_LIGHT_INDIRECT);
+}
+
+ccl_device void kernel_split_branched_indirect_light_end(KernelGlobals *kg, int ray_index)
+{
+ kernel_split_branched_path_indirect_loop_end(kg, ray_index);
+
+ ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index];
+ ShaderData *sd = &kernel_split_state.sd[ray_index];
+ ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
+ ccl_global Ray *ray = &kernel_split_state.ray[ray_index];
+
+ /* continue in case of transparency */
+ *throughput *= shader_bsdf_transparency(kg, sd);
+
+ if(is_zero(*throughput)) {
+ kernel_split_path_end(kg, ray_index);
+ }
+ else {
+ /* Update Path State */
+ state->flag |= PATH_RAY_TRANSPARENT;
+ state->transparent_bounce++;
+
+ ray->P = ray_offset(sd->P, -sd->Ng);
+ ray->t -= sd->ray_length; /* clipping works through transparent */
+
+# ifdef __RAY_DIFFERENTIALS__
+ ray->dP = sd->dP;
+ ray->dD.dx = -sd->dI.dx;
+ ray->dD.dy = -sd->dI.dy;
+# endif /* __RAY_DIFFERENTIALS__ */
+
+# ifdef __VOLUME__
+ /* enter/exit volume */
+ kernel_volume_stack_enter_exit(kg, sd, state->volume_stack);
+# endif /* __VOLUME__ */
+ }
+}
+#endif /* __BRANCHED_PATH__ */
+
ccl_device void kernel_next_iteration_setup(KernelGlobals *kg,
ccl_local_param unsigned int *local_queue_atomics)
{
@@ -67,7 +113,6 @@ ccl_device void kernel_next_iteration_setup(KernelGlobals *kg,
kernel_split_params.queue_index[QUEUE_SHADOW_RAY_CAST_DL_RAYS] = 0;
}
- char enqueue_flag = 0;
int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
ray_index = get_ray_index(kg, ray_index,
QUEUE_ACTIVE_AND_REGENERATED_RAYS,
@@ -75,102 +120,127 @@ ccl_device void kernel_next_iteration_setup(KernelGlobals *kg,
kernel_split_params.queue_size,
0);
-#ifdef __COMPUTE_DEVICE_GPU__
- /* If we are executing on a GPU device, we exit all threads that are not
- * required.
- *
- * If we are executing on a CPU device, then we need to keep all threads
- * active since we have barrier() calls later in the kernel. CPU devices,
- * expect all threads to execute barrier statement.
- */
- if(ray_index == QUEUE_EMPTY_SLOT) {
- return;
- }
-#endif
-
-#ifndef __COMPUTE_DEVICE_GPU__
- if(ray_index != QUEUE_EMPTY_SLOT) {
-#endif
-
- /* Load ShaderData structure. */
- PathRadiance *L = NULL;
- ccl_global PathState *state = NULL;
ccl_global char *ray_state = kernel_split_state.ray_state;
- /* Path radiance update for AO/Direct_lighting's shadow blocked. */
- if(IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL) ||
- IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO))
- {
- state = &kernel_split_state.path_state[ray_index];
- L = &kernel_split_state.path_radiance[ray_index];
- float3 _throughput = kernel_split_state.throughput[ray_index];
-
- if(IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO)) {
- float3 shadow = kernel_split_state.ao_light_ray[ray_index].P;
- // TODO(mai): investigate correctness here
- char update_path_radiance = (char)kernel_split_state.ao_light_ray[ray_index].t;
- if(update_path_radiance) {
- path_radiance_accum_ao(L,
- _throughput,
- kernel_split_state.ao_alpha[ray_index],
- kernel_split_state.ao_bsdf[ray_index],
- shadow,
- state->bounce);
- }
- else {
- path_radiance_accum_total_ao(L, _throughput, kernel_split_state.ao_bsdf[ray_index]);
+ bool active = IS_STATE(ray_state, ray_index, RAY_ACTIVE);
+ if(active) {
+ ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index];
+ ccl_global Ray *ray = &kernel_split_state.ray[ray_index];
+ RNG rng = kernel_split_state.rng[ray_index];
+ ShaderData *sd = &kernel_split_state.sd[ray_index];
+ ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
+ PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
+
+#ifdef __BRANCHED_PATH__
+ if(!kernel_data.integrator.branched || IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT)) {
+#endif
+ /* Compute direct lighting and next bounce. */
+ if(!kernel_path_surface_bounce(kg, &rng, sd, throughput, state, L, ray)) {
+ kernel_split_path_end(kg, ray_index);
}
- REMOVE_RAY_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO);
+#ifdef __BRANCHED_PATH__
}
-
- if(IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL)) {
- float3 shadow = kernel_split_state.light_ray[ray_index].P;
- // TODO(mai): investigate correctness here
- char update_path_radiance = (char)kernel_split_state.light_ray[ray_index].t;
- BsdfEval L_light = kernel_split_state.bsdf_eval[ray_index];
- if(update_path_radiance) {
- path_radiance_accum_light(L,
- _throughput,
- &L_light,
- shadow,
- 1.0f,
- state->bounce,
- kernel_split_state.is_lamp[ray_index]);
+ else {
+ kernel_split_branched_indirect_light_init(kg, ray_index);
+
+ if(kernel_split_branched_path_surface_indirect_light_iter(kg,
+ ray_index,
+ 1.0f,
+ &kernel_split_state.branched_state[ray_index].sd,
+ true,
+ true))
+ {
+ ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED);
}
else {
- path_radiance_accum_total_light(L, _throughput, &L_light);
+ kernel_split_branched_indirect_light_end(kg, ray_index);
}
- REMOVE_RAY_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL);
}
+#endif /* __BRANCHED_PATH__ */
+
+ kernel_split_state.rng[ray_index] = rng;
}
- if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
- ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index];
- ccl_global Ray *ray = &kernel_split_state.ray[ray_index];
- RNG rng = kernel_split_state.rng[ray_index];
- state = &kernel_split_state.path_state[ray_index];
- L = &kernel_split_state.path_radiance[ray_index];
+ /* Enqueue RAY_UPDATE_BUFFER rays. */
+ enqueue_ray_index_local(ray_index,
+ QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
+ IS_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER) && active,
+ kernel_split_params.queue_size,
+ local_queue_atomics,
+ kernel_split_state.queue_data,
+ kernel_split_params.queue_index);
+
+#ifdef __BRANCHED_PATH__
+ /* iter loop */
+ if(ccl_global_id(0) == 0 && ccl_global_id(1) == 0) {
+ kernel_split_params.queue_index[QUEUE_LIGHT_INDIRECT_ITER] = 0;
+ }
- /* Compute direct lighting and next bounce. */
- if(!kernel_path_surface_bounce(kg, &rng, &kernel_split_state.sd[ray_index], throughput, state, L, ray)) {
- ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
- enqueue_flag = 1;
+ ray_index = get_ray_index(kg, ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0),
+ QUEUE_LIGHT_INDIRECT_ITER,
+ kernel_split_state.queue_data,
+ kernel_split_params.queue_size,
+ 1);
+
+ if(IS_STATE(ray_state, ray_index, RAY_LIGHT_INDIRECT_NEXT_ITER)) {
+ /* for render passes, sum and reset indirect light pass variables
+ * for the next samples */
+ PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
+
+ path_radiance_sum_indirect(L);
+ path_radiance_reset_indirect(L);
+
+ if(kernel_split_branched_path_surface_indirect_light_iter(kg,
+ ray_index,
+ 1.0f,
+ &kernel_split_state.branched_state[ray_index].sd,
+ true,
+ true))
+ {
+ ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED);
+ }
+ else {
+ kernel_split_branched_indirect_light_end(kg, ray_index);
}
- kernel_split_state.rng[ray_index] = rng;
}
-#ifndef __COMPUTE_DEVICE_GPU__
+# ifdef __VOLUME__
+ /* Enqueue RAY_VOLUME_INDIRECT_NEXT_ITER rays */
+ ccl_barrier(CCL_LOCAL_MEM_FENCE);
+ if(ccl_local_id(0) == 0 && ccl_local_id(1) == 0) {
+ *local_queue_atomics = 0;
}
-#endif
+ ccl_barrier(CCL_LOCAL_MEM_FENCE);
- /* Enqueue RAY_UPDATE_BUFFER rays. */
+ ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
enqueue_ray_index_local(ray_index,
- QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
- enqueue_flag,
+ QUEUE_VOLUME_INDIRECT_ITER,
+ IS_STATE(kernel_split_state.ray_state, ray_index, RAY_VOLUME_INDIRECT_NEXT_ITER),
+ kernel_split_params.queue_size,
+ local_queue_atomics,
+ kernel_split_state.queue_data,
+ kernel_split_params.queue_index);
+
+# endif /* __VOLUME__ */
+
+# ifdef __SUBSURFACE__
+ /* Enqueue RAY_SUBSURFACE_INDIRECT_NEXT_ITER rays */
+ ccl_barrier(CCL_LOCAL_MEM_FENCE);
+ if(ccl_local_id(0) == 0 && ccl_local_id(1) == 0) {
+ *local_queue_atomics = 0;
+ }
+ ccl_barrier(CCL_LOCAL_MEM_FENCE);
+
+ ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
+ enqueue_ray_index_local(ray_index,
+ QUEUE_SUBSURFACE_INDIRECT_ITER,
+ IS_STATE(kernel_split_state.ray_state, ray_index, RAY_SUBSURFACE_INDIRECT_NEXT_ITER),
kernel_split_params.queue_size,
local_queue_atomics,
kernel_split_state.queue_data,
kernel_split_params.queue_index);
+# endif /* __SUBSURFACE__ */
+#endif /* __BRANCHED_PATH__ */
}
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_queue_enqueue.h b/intern/cycles/kernel/split/kernel_queue_enqueue.h
index e2e841f36d3..66ce2dfb6f1 100644
--- a/intern/cycles/kernel/split/kernel_queue_enqueue.h
+++ b/intern/cycles/kernel/split/kernel_queue_enqueue.h
@@ -51,7 +51,8 @@ ccl_device void kernel_queue_enqueue(KernelGlobals *kg,
int queue_number = -1;
if(IS_STATE(kernel_split_state.ray_state, ray_index, RAY_HIT_BACKGROUND) ||
- IS_STATE(kernel_split_state.ray_state, ray_index, RAY_UPDATE_BUFFER)) {
+ IS_STATE(kernel_split_state.ray_state, ray_index, RAY_UPDATE_BUFFER) ||
+ IS_STATE(kernel_split_state.ray_state, ray_index, RAY_TO_REGENERATE)) {
queue_number = QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS;
}
else if(IS_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE) ||
diff --git a/intern/cycles/kernel/split/kernel_scene_intersect.h b/intern/cycles/kernel/split/kernel_scene_intersect.h
index 5dc94caec85..45984ca509b 100644
--- a/intern/cycles/kernel/split/kernel_scene_intersect.h
+++ b/intern/cycles/kernel/split/kernel_scene_intersect.h
@@ -43,11 +43,21 @@ ccl_device void kernel_scene_intersect(KernelGlobals *kg)
}
/* All regenerated rays become active here */
- if(IS_STATE(kernel_split_state.ray_state, ray_index, RAY_REGENERATED))
- ASSIGN_RAY_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE);
+ if(IS_STATE(kernel_split_state.ray_state, ray_index, RAY_REGENERATED)) {
+#ifdef __BRANCHED_PATH__
+ if(kernel_split_state.branched_state[ray_index].waiting_on_shared_samples) {
+ kernel_split_path_end(kg, ray_index);
+ }
+ else
+#endif /* __BRANCHED_PATH__ */
+ {
+ ASSIGN_RAY_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE);
+ }
+ }
- if(!IS_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE))
+ if(!IS_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE)) {
return;
+ }
#ifdef __KERNEL_DEBUG__
DebugData *debug_data = &kernel_split_state.debug_data[ray_index];
diff --git a/intern/cycles/kernel/split/kernel_shader_eval.h b/intern/cycles/kernel/split/kernel_shader_eval.h
index 0f1696e34a0..2801b32f285 100644
--- a/intern/cycles/kernel/split/kernel_shader_eval.h
+++ b/intern/cycles/kernel/split/kernel_shader_eval.h
@@ -1,5 +1,5 @@
/*
- * Copyright 2011-2015 Blender Foundation
+ * Copyright 2011-2017 Blender Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -16,54 +16,61 @@
CCL_NAMESPACE_BEGIN
-/* This kernel sets up the ShaderData structure from the values computed
+/* This kernel evaluates ShaderData structure from the values computed
* by the previous kernels.
- *
- * It also identifies the rays of state RAY_TO_REGENERATE and enqueues them
- * in QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue.
*/
-ccl_device void kernel_shader_eval(KernelGlobals *kg,
- ccl_local_param unsigned int *local_queue_atomics)
+ccl_device void kernel_shader_eval(KernelGlobals *kg)
{
- /* Enqeueue RAY_TO_REGENERATE rays into QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue. */
- if(ccl_local_id(0) == 0 && ccl_local_id(1) == 0) {
- *local_queue_atomics = 0;
- }
- ccl_barrier(CCL_LOCAL_MEM_FENCE);
int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
+ /* Sorting on cuda split is not implemented */
+#ifdef __KERNEL_CUDA__
+ int queue_index = kernel_split_params.queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS];
+#else
+ int queue_index = kernel_split_params.queue_index[QUEUE_SHADER_SORTED_RAYS];
+#endif
+ if(ray_index >= queue_index) {
+ return;
+ }
ray_index = get_ray_index(kg, ray_index,
+#ifdef __KERNEL_CUDA__
QUEUE_ACTIVE_AND_REGENERATED_RAYS,
+#else
+ QUEUE_SHADER_SORTED_RAYS,
+#endif
kernel_split_state.queue_data,
kernel_split_params.queue_size,
0);
- char enqueue_flag = 0;
- if((ray_index != QUEUE_EMPTY_SLOT) && IS_STATE(kernel_split_state.ray_state, ray_index, RAY_TO_REGENERATE)) {
- enqueue_flag = 1;
+ if(ray_index == QUEUE_EMPTY_SLOT) {
+ return;
}
- enqueue_ray_index_local(ray_index,
- QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
- enqueue_flag,
- kernel_split_params.queue_size,
- local_queue_atomics,
- kernel_split_state.queue_data,
- kernel_split_params.queue_index);
-
- /* Continue on with shader evaluation. */
- if((ray_index != QUEUE_EMPTY_SLOT) && IS_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE)) {
- Intersection isect = kernel_split_state.isect[ray_index];
+ ccl_global char *ray_state = kernel_split_state.ray_state;
+ if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
RNG rng = kernel_split_state.rng[ray_index];
ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
- Ray ray = kernel_split_state.ray[ray_index];
- shader_setup_from_ray(kg,
- &kernel_split_state.sd[ray_index],
- &isect,
- &ray);
+#ifndef __BRANCHED_PATH__
float rbsdf = path_state_rng_1D_for_decision(kg, &rng, state, PRNG_BSDF);
shader_eval_surface(kg, &kernel_split_state.sd[ray_index], &rng, state, rbsdf, state->flag, SHADER_CONTEXT_MAIN);
+#else
+ ShaderContext ctx = SHADER_CONTEXT_MAIN;
+ float rbsdf = 0.0f;
+
+ if(!kernel_data.integrator.branched || IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT)) {
+ rbsdf = path_state_rng_1D_for_decision(kg, &rng, state, PRNG_BSDF);
+
+ }
+
+ if(IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT)) {
+ ctx = SHADER_CONTEXT_INDIRECT;
+ }
+
+ shader_eval_surface(kg, &kernel_split_state.sd[ray_index], &rng, state, rbsdf, state->flag, ctx);
+ shader_merge_closures(&kernel_split_state.sd[ray_index]);
+#endif /* __BRANCHED_PATH__ */
+
kernel_split_state.rng[ray_index] = rng;
}
}
diff --git a/intern/cycles/kernel/split/kernel_shader_setup.h b/intern/cycles/kernel/split/kernel_shader_setup.h
new file mode 100644
index 00000000000..0432689d9fa
--- /dev/null
+++ b/intern/cycles/kernel/split/kernel_shader_setup.h
@@ -0,0 +1,70 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+/* This kernel sets up the ShaderData structure from the values computed
+ * by the previous kernels.
+ *
+ * It also identifies the rays of state RAY_TO_REGENERATE and enqueues them
+ * in QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue.
+ */
+ccl_device void kernel_shader_setup(KernelGlobals *kg,
+ ccl_local_param unsigned int *local_queue_atomics)
+{
+ /* Enqeueue RAY_TO_REGENERATE rays into QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue. */
+ if(ccl_local_id(0) == 0 && ccl_local_id(1) == 0) {
+ *local_queue_atomics = 0;
+ }
+ ccl_barrier(CCL_LOCAL_MEM_FENCE);
+
+ int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
+ int queue_index = kernel_split_params.queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS];
+ if(ray_index >= queue_index) {
+ return;
+ }
+ ray_index = get_ray_index(kg, ray_index,
+ QUEUE_ACTIVE_AND_REGENERATED_RAYS,
+ kernel_split_state.queue_data,
+ kernel_split_params.queue_size,
+ 0);
+
+ if(ray_index == QUEUE_EMPTY_SLOT) {
+ return;
+ }
+
+ char enqueue_flag = (IS_STATE(kernel_split_state.ray_state, ray_index, RAY_TO_REGENERATE)) ? 1 : 0;
+ enqueue_ray_index_local(ray_index,
+ QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
+ enqueue_flag,
+ kernel_split_params.queue_size,
+ local_queue_atomics,
+ kernel_split_state.queue_data,
+ kernel_split_params.queue_index);
+
+ /* Continue on with shader evaluation. */
+ if(IS_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE)) {
+ Intersection isect = kernel_split_state.isect[ray_index];
+ Ray ray = kernel_split_state.ray[ray_index];
+
+ shader_setup_from_ray(kg,
+ &kernel_split_state.sd[ray_index],
+ &isect,
+ &ray);
+ }
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_shader_sort.h b/intern/cycles/kernel/split/kernel_shader_sort.h
new file mode 100644
index 00000000000..297decb0bc2
--- /dev/null
+++ b/intern/cycles/kernel/split/kernel_shader_sort.h
@@ -0,0 +1,97 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+
+ccl_device void kernel_shader_sort(KernelGlobals *kg,
+ ccl_local_param ShaderSortLocals *locals)
+{
+#ifndef __KERNEL_CUDA__
+ int tid = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
+ uint qsize = kernel_split_params.queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS];
+ if(tid == 0) {
+ kernel_split_params.queue_index[QUEUE_SHADER_SORTED_RAYS] = qsize;
+ }
+
+ uint offset = (tid/SHADER_SORT_LOCAL_SIZE)*SHADER_SORT_BLOCK_SIZE;
+ if(offset >= qsize) {
+ return;
+ }
+
+ int lid = ccl_local_id(1) * ccl_local_size(0) + ccl_local_id(0);
+ uint input = QUEUE_ACTIVE_AND_REGENERATED_RAYS * (kernel_split_params.queue_size);
+ uint output = QUEUE_SHADER_SORTED_RAYS * (kernel_split_params.queue_size);
+ ccl_local uint *local_value = &locals->local_value[0];
+ ccl_local ushort *local_index = &locals->local_index[0];
+
+ /* copy to local memory */
+ for (uint i = 0; i < SHADER_SORT_BLOCK_SIZE; i += SHADER_SORT_LOCAL_SIZE) {
+ uint idx = offset + i + lid;
+ uint add = input + idx;
+ uint value = (~0);
+ if(idx < qsize) {
+ int ray_index = kernel_split_state.queue_data[add];
+ bool valid = (ray_index != QUEUE_EMPTY_SLOT) && IS_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE);
+ if(valid) {
+ value = kernel_split_state.sd[ray_index].shader & SHADER_MASK;
+ }
+ }
+ local_value[i + lid] = value;
+ local_index[i + lid] = i + lid;
+ }
+ ccl_barrier(CCL_LOCAL_MEM_FENCE);
+
+ /* skip sorting for cpu split kernel */
+# ifdef __KERNEL_OPENCL__
+
+ /* bitonic sort */
+ for (uint length = 1; length < SHADER_SORT_BLOCK_SIZE; length <<= 1) {
+ for (uint inc = length; inc > 0; inc >>= 1) {
+ for (uint ii = 0; ii < SHADER_SORT_BLOCK_SIZE; ii += SHADER_SORT_LOCAL_SIZE) {
+ uint i = lid + ii;
+ bool direction = ((i & (length << 1)) != 0);
+ uint j = i ^ inc;
+ ushort ioff = local_index[i];
+ ushort joff = local_index[j];
+ uint iKey = local_value[ioff];
+ uint jKey = local_value[joff];
+ bool smaller = (jKey < iKey) || (jKey == iKey && j < i);
+ bool swap = smaller ^ (j < i) ^ direction;
+ ccl_barrier(CCL_LOCAL_MEM_FENCE);
+ local_index[i] = (swap) ? joff : ioff;
+ local_index[j] = (swap) ? ioff : joff;
+ ccl_barrier(CCL_LOCAL_MEM_FENCE);
+ }
+ }
+ }
+# endif /* __KERNEL_OPENCL__ */
+
+ /* copy to destination */
+ for (uint i = 0; i < SHADER_SORT_BLOCK_SIZE; i += SHADER_SORT_LOCAL_SIZE) {
+ uint idx = offset + i + lid;
+ uint lidx = local_index[i + lid];
+ uint outi = output + idx;
+ uint ini = input + offset + lidx;
+ uint value = local_value[lidx];
+ if(idx < qsize) {
+ kernel_split_state.queue_data[outi] = (value == (~0)) ? QUEUE_EMPTY_SLOT : kernel_split_state.queue_data[ini];
+ }
+ }
+#endif /* __KERNEL_CUDA__ */
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_shadow_blocked_ao.h b/intern/cycles/kernel/split/kernel_shadow_blocked_ao.h
index 4243e18de72..474286285a9 100644
--- a/intern/cycles/kernel/split/kernel_shadow_blocked_ao.h
+++ b/intern/cycles/kernel/split/kernel_shadow_blocked_ao.h
@@ -29,31 +29,29 @@ ccl_device void kernel_shadow_blocked_ao(KernelGlobals *kg)
kernel_split_state.queue_data, kernel_split_params.queue_size, 1);
}
- if(ray_index == QUEUE_EMPTY_SLOT)
+ if(ray_index == QUEUE_EMPTY_SLOT) {
return;
+ }
- /* Flag determining if we need to update L. */
- char update_path_radiance = 0;
-
- if(IS_FLAG(kernel_split_state.ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO)) {
- ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
- ccl_global Ray *light_ray_global = &kernel_split_state.ao_light_ray[ray_index];
-
- float3 shadow;
- Ray ray = *light_ray_global;
- update_path_radiance = !(shadow_blocked(kg,
- &kernel_split_state.sd_DL_shadow[ray_index],
- state,
- &ray,
- &shadow));
-
- *light_ray_global = ray;
- /* We use light_ray_global's P and t to store shadow and
- * update_path_radiance.
- */
- light_ray_global->P = shadow;
- light_ray_global->t = update_path_radiance;
+ ShaderData *sd = &kernel_split_state.sd[ray_index];
+ ShaderData *emission_sd = &kernel_split_state.sd_DL_shadow[ray_index];
+ PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
+ ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
+ RNG rng = kernel_split_state.rng[ray_index];
+ float3 throughput = kernel_split_state.throughput[ray_index];
+
+#ifdef __BRANCHED_PATH__
+ if(!kernel_data.integrator.branched || IS_FLAG(kernel_split_state.ray_state, ray_index, RAY_BRANCHED_INDIRECT)) {
+#endif
+ kernel_path_ao(kg, sd, emission_sd, L, state, &rng, throughput, shader_bsdf_alpha(kg, sd));
+#ifdef __BRANCHED_PATH__
+ }
+ else {
+ kernel_branched_path_ao(kg, sd, emission_sd, L, state, &rng, throughput);
}
+#endif
+
+ kernel_split_state.rng[ray_index] = rng;
}
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_shadow_blocked_dl.h b/intern/cycles/kernel/split/kernel_shadow_blocked_dl.h
index bb8f0157965..78e61709b01 100644
--- a/intern/cycles/kernel/split/kernel_shadow_blocked_dl.h
+++ b/intern/cycles/kernel/split/kernel_shadow_blocked_dl.h
@@ -29,31 +29,82 @@ ccl_device void kernel_shadow_blocked_dl(KernelGlobals *kg)
kernel_split_state.queue_data, kernel_split_params.queue_size, 1);
}
+#ifdef __BRANCHED_PATH__
+ /* TODO(mai): move this somewhere else? */
+ if(thread_index == 0) {
+ /* Clear QUEUE_INACTIVE_RAYS before next kernel. */
+ kernel_split_params.queue_index[QUEUE_INACTIVE_RAYS] = 0;
+ }
+#endif /* __BRANCHED_PATH__ */
+
if(ray_index == QUEUE_EMPTY_SLOT)
return;
- /* Flag determining if we need to update L. */
- char update_path_radiance = 0;
+ ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
+ Ray ray = kernel_split_state.light_ray[ray_index];
+ PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
+ ShaderData *sd = &kernel_split_state.sd[ray_index];
+ float3 throughput = kernel_split_state.throughput[ray_index];
+ RNG rng = kernel_split_state.rng[ray_index];
+
+ BsdfEval L_light = kernel_split_state.bsdf_eval[ray_index];
+ ShaderData *emission_sd = &kernel_split_state.sd_DL_shadow[ray_index];
+ bool is_lamp = kernel_split_state.is_lamp[ray_index];
+
+# if defined(__BRANCHED_PATH__) || defined(__SHADOW_TRICKS__)
+ bool use_branched = false;
+ int all = 0;
+
+ if(state->flag & PATH_RAY_SHADOW_CATCHER) {
+ use_branched = true;
+ all = 1;
+ }
+# if defined(__BRANCHED_PATH__)
+ else if(kernel_data.integrator.branched) {
+ use_branched = true;
- if(IS_FLAG(kernel_split_state.ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL)) {
- ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
- ccl_global Ray *light_ray_global = &kernel_split_state.light_ray[ray_index];
+ if(IS_FLAG(kernel_split_state.ray_state, ray_index, RAY_BRANCHED_INDIRECT)) {
+ all = (kernel_data.integrator.sample_all_lights_indirect);
+ }
+ else
+ {
+ all = (kernel_data.integrator.sample_all_lights_direct);
+ }
+ }
+# endif /* __BRANCHED_PATH__ */
+ if(use_branched) {
+ kernel_branched_path_surface_connect_light(kg,
+ &rng,
+ sd,
+ emission_sd,
+ state,
+ throughput,
+ 1.0f,
+ L,
+ all);
+ }
+ else
+# endif /* defined(__BRANCHED_PATH__) || defined(__SHADOW_TRICKS__)*/
+ {
+ /* trace shadow ray */
float3 shadow;
- Ray ray = *light_ray_global;
- update_path_radiance = !(shadow_blocked(kg,
- &kernel_split_state.sd_DL_shadow[ray_index],
- state,
- &ray,
- &shadow));
-
- *light_ray_global = ray;
- /* We use light_ray_global's P and t to store shadow and
- * update_path_radiance.
- */
- light_ray_global->P = shadow;
- light_ray_global->t = update_path_radiance;
+
+ if(!shadow_blocked(kg,
+ emission_sd,
+ state,
+ &ray,
+ &shadow))
+ {
+ /* accumulate */
+ path_radiance_accum_light(L, state, throughput, &L_light, shadow, 1.0f, is_lamp);
+ }
+ else {
+ path_radiance_accum_total_light(L, state, throughput, &L_light);
+ }
}
+
+ kernel_split_state.rng[ray_index] = rng;
}
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_split_common.h b/intern/cycles/kernel/split/kernel_split_common.h
index 4303ba0a905..08f0124b529 100644
--- a/intern/cycles/kernel/split/kernel_split_common.h
+++ b/intern/cycles/kernel/split/kernel_split_common.h
@@ -37,41 +37,55 @@
#include "util/util_atomic.h"
-#include "kernel/kernel_random.h"
-#include "kernel/kernel_projection.h"
-#include "kernel/kernel_montecarlo.h"
-#include "kernel/kernel_differential.h"
-#include "kernel/kernel_camera.h"
-
-#include "kernel/geom/geom.h"
-#include "kernel/bvh/bvh.h"
-
-#include "kernel/kernel_accumulate.h"
-#include "kernel/kernel_shader.h"
-#include "kernel/kernel_light.h"
-#include "kernel/kernel_passes.h"
-
-#ifdef __SUBSURFACE__
-# include "kernel/kernel_subsurface.h"
+#include "kernel/kernel_path.h"
+#ifdef __BRANCHED_PATH__
+# include "kernel/kernel_path_branched.h"
#endif
-#ifdef __VOLUME__
-# include "kernel/kernel_volume.h"
-#endif
+#include "kernel/kernel_queues.h"
+#include "kernel/kernel_work_stealing.h"
-#include "kernel/kernel_path_state.h"
-#include "kernel/kernel_shadow.h"
-#include "kernel/kernel_emission.h"
-#include "kernel/kernel_path_common.h"
-#include "kernel/kernel_path_surface.h"
-#include "kernel/kernel_path_volume.h"
-#include "kernel/kernel_path_subsurface.h"
+#ifdef __BRANCHED_PATH__
+# include "kernel/split/kernel_branched.h"
+#endif
-#ifdef __KERNEL_DEBUG__
-# include "kernel/kernel_debug.h"
+CCL_NAMESPACE_BEGIN
+
+ccl_device_inline void kernel_split_path_end(KernelGlobals *kg, int ray_index)
+{
+ ccl_global char *ray_state = kernel_split_state.ray_state;
+
+#ifdef __BRANCHED_PATH__
+ if(IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT_SHARED)) {
+ int orig_ray = kernel_split_state.branched_state[ray_index].original_ray;
+
+ PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
+ PathRadiance *orig_ray_L = &kernel_split_state.path_radiance[orig_ray];
+
+ path_radiance_sum_indirect(L);
+ path_radiance_accum_sample(orig_ray_L, L, 1);
+
+ atomic_fetch_and_dec_uint32((ccl_global uint*)&kernel_split_state.branched_state[orig_ray].shared_sample_count);
+
+ ASSIGN_RAY_STATE(ray_state, ray_index, RAY_INACTIVE);
+ }
+ else if(IS_FLAG(ray_state, ray_index, RAY_BRANCHED_LIGHT_INDIRECT)) {
+ ASSIGN_RAY_STATE(ray_state, ray_index, RAY_LIGHT_INDIRECT_NEXT_ITER);
+ }
+ else if(IS_FLAG(ray_state, ray_index, RAY_BRANCHED_VOLUME_INDIRECT)) {
+ ASSIGN_RAY_STATE(ray_state, ray_index, RAY_VOLUME_INDIRECT_NEXT_ITER);
+ }
+ else if(IS_FLAG(ray_state, ray_index, RAY_BRANCHED_SUBSURFACE_INDIRECT)) {
+ ASSIGN_RAY_STATE(ray_state, ray_index, RAY_SUBSURFACE_INDIRECT_NEXT_ITER);
+ }
+ else {
+ ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
+ }
+#else
+ ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
#endif
+}
-#include "kernel/kernel_queues.h"
-#include "kernel/kernel_work_stealing.h"
+CCL_NAMESPACE_END
#endif /* __KERNEL_SPLIT_H__ */
diff --git a/intern/cycles/kernel/split/kernel_split_data.h b/intern/cycles/kernel/split/kernel_split_data.h
index 17e6587883a..eac22050a38 100644
--- a/intern/cycles/kernel/split/kernel_split_data.h
+++ b/intern/cycles/kernel/split/kernel_split_data.h
@@ -31,14 +31,6 @@ ccl_device_inline uint64_t split_data_buffer_size(KernelGlobals *kg, size_t num_
size = size SPLIT_DATA_ENTRIES;
#undef SPLIT_DATA_ENTRY
-#ifdef __SUBSURFACE__
- size += align_up(num_elements * sizeof(SubsurfaceIndirectRays), 16); /* ss_rays */
-#endif
-
-#ifdef __VOLUME__
- size += align_up(2 * num_elements * sizeof(PathState), 16); /* state_shadow */
-#endif
-
return size;
}
@@ -57,16 +49,6 @@ ccl_device_inline void split_data_init(KernelGlobals *kg,
SPLIT_DATA_ENTRIES;
#undef SPLIT_DATA_ENTRY
-#ifdef __SUBSURFACE__
- split_data->ss_rays = (ccl_global SubsurfaceIndirectRays*)p;
- p += align_up(num_elements * sizeof(SubsurfaceIndirectRays), 16);
-#endif
-
-#ifdef __VOLUME__
- split_data->state_shadow = (ccl_global PathState*)p;
- p += align_up(2 * num_elements * sizeof(PathState), 16);
-#endif
-
split_data->ray_state = ray_state;
}
diff --git a/intern/cycles/kernel/split/kernel_split_data_types.h b/intern/cycles/kernel/split/kernel_split_data_types.h
index 748197b7183..4bb2f0d3d80 100644
--- a/intern/cycles/kernel/split/kernel_split_data_types.h
+++ b/intern/cycles/kernel/split/kernel_split_data_types.h
@@ -43,6 +43,9 @@ typedef struct SplitParams {
ccl_global char *use_queues_flag;
ccl_global float *buffer;
+
+ /* Place for storing sd->flag. AMD GPU OpenCL compiler workaround */
+ int dummy_sd_flag;
} SplitParams;
/* Global memory variables [porting]; These memory is used for
@@ -59,7 +62,64 @@ typedef struct SplitParams {
SPLIT_DATA_ENTRY(DebugData, debug_data, 1)
#else
# define SPLIT_DATA_DEBUG_ENTRIES
-#endif
+#endif /* DEBUG */
+
+#ifdef __BRANCHED_PATH__
+
+typedef ccl_global struct SplitBranchedState {
+ /* various state that must be kept and restored after an indirect loop */
+ PathState path_state;
+ float3 throughput;
+ Ray ray;
+
+ struct ShaderData sd;
+ Intersection isect;
+
+ char ray_state;
+
+ /* indirect loop state */
+ int next_closure;
+ int next_sample;
+ int num_samples;
+
+#ifdef __SUBSURFACE__
+ int ss_next_closure;
+ int ss_next_sample;
+ int next_hit;
+ int num_hits;
+
+ uint lcg_state;
+ SubsurfaceIntersection ss_isect;
+
+# ifdef __VOLUME__
+ VolumeStack volume_stack[VOLUME_STACK_SIZE];
+# endif /* __VOLUME__ */
+#endif /*__SUBSURFACE__ */
+
+ int shared_sample_count; /* number of branched samples shared with other threads */
+ int original_ray; /* index of original ray when sharing branched samples */
+ bool waiting_on_shared_samples;
+} SplitBranchedState;
+
+#define SPLIT_DATA_BRANCHED_ENTRIES \
+ SPLIT_DATA_ENTRY( SplitBranchedState, branched_state, 1)
+#else
+#define SPLIT_DATA_BRANCHED_ENTRIES
+#endif /* __BRANCHED_PATH__ */
+
+#ifdef __SUBSURFACE__
+# define SPLIT_DATA_SUBSURFACE_ENTRIES \
+ SPLIT_DATA_ENTRY(ccl_global SubsurfaceIndirectRays, ss_rays, 1)
+#else
+# define SPLIT_DATA_SUBSURFACE_ENTRIES
+#endif /* __SUBSURFACE__ */
+
+#ifdef __VOLUME__
+# define SPLIT_DATA_VOLUME_ENTRIES \
+ SPLIT_DATA_ENTRY(ccl_global PathState, state_shadow, 1)
+#else
+# define SPLIT_DATA_VOLUME_ENTRIES
+#endif /* __VOLUME__ */
#define SPLIT_DATA_ENTRIES \
SPLIT_DATA_ENTRY(ccl_global RNG, rng, 1) \
@@ -69,9 +129,6 @@ typedef struct SplitParams {
SPLIT_DATA_ENTRY(ccl_global Ray, ray, 1) \
SPLIT_DATA_ENTRY(ccl_global PathState, path_state, 1) \
SPLIT_DATA_ENTRY(ccl_global Intersection, isect, 1) \
- SPLIT_DATA_ENTRY(ccl_global float3, ao_alpha, 1) \
- SPLIT_DATA_ENTRY(ccl_global float3, ao_bsdf, 1) \
- SPLIT_DATA_ENTRY(ccl_global Ray, ao_light_ray, 1) \
SPLIT_DATA_ENTRY(ccl_global BsdfEval, bsdf_eval, 1) \
SPLIT_DATA_ENTRY(ccl_global int, is_lamp, 1) \
SPLIT_DATA_ENTRY(ccl_global Ray, light_ray, 1) \
@@ -79,6 +136,28 @@ typedef struct SplitParams {
SPLIT_DATA_ENTRY(ccl_global uint, work_array, 1) \
SPLIT_DATA_ENTRY(ShaderData, sd, 1) \
SPLIT_DATA_ENTRY(ShaderData, sd_DL_shadow, 1) \
+ SPLIT_DATA_SUBSURFACE_ENTRIES \
+ SPLIT_DATA_VOLUME_ENTRIES \
+ SPLIT_DATA_BRANCHED_ENTRIES \
+ SPLIT_DATA_DEBUG_ENTRIES \
+
+/* entries to be copied to inactive rays when sharing branched samples (TODO: which are actually needed?) */
+#define SPLIT_DATA_ENTRIES_BRANCHED_SHARED \
+ SPLIT_DATA_ENTRY(ccl_global RNG, rng, 1) \
+ SPLIT_DATA_ENTRY(ccl_global float3, throughput, 1) \
+ SPLIT_DATA_ENTRY(ccl_global float, L_transparent, 1) \
+ SPLIT_DATA_ENTRY(PathRadiance, path_radiance, 1) \
+ SPLIT_DATA_ENTRY(ccl_global Ray, ray, 1) \
+ SPLIT_DATA_ENTRY(ccl_global PathState, path_state, 1) \
+ SPLIT_DATA_ENTRY(ccl_global Intersection, isect, 1) \
+ SPLIT_DATA_ENTRY(ccl_global BsdfEval, bsdf_eval, 1) \
+ SPLIT_DATA_ENTRY(ccl_global int, is_lamp, 1) \
+ SPLIT_DATA_ENTRY(ccl_global Ray, light_ray, 1) \
+ SPLIT_DATA_ENTRY(ShaderData, sd, 1) \
+ SPLIT_DATA_ENTRY(ShaderData, sd_DL_shadow, 1) \
+ SPLIT_DATA_SUBSURFACE_ENTRIES \
+ SPLIT_DATA_VOLUME_ENTRIES \
+ SPLIT_DATA_BRANCHED_ENTRIES \
SPLIT_DATA_DEBUG_ENTRIES \
/* struct that holds pointers to data in the shared state buffer */
@@ -87,14 +166,6 @@ typedef struct SplitData {
SPLIT_DATA_ENTRIES
#undef SPLIT_DATA_ENTRY
-#ifdef __SUBSURFACE__
- ccl_global SubsurfaceIndirectRays *ss_rays;
-#endif
-
-#ifdef __VOLUME__
- ccl_global PathState *state_shadow;
-#endif
-
/* this is actually in a separate buffer from the rest of the split state data (so it can be read back from
* the host easily) but is still used the same as the other data so we have it here in this struct as well
*/
@@ -122,6 +193,11 @@ typedef struct BackgroundAOLocals {
uint queue_atomics_ao;
} BackgroundAOLocals;
+typedef struct ShaderSortLocals {
+ uint local_value[SHADER_SORT_BLOCK_SIZE];
+ ushort local_index[SHADER_SORT_BLOCK_SIZE];
+} ShaderSortLocals;
+
CCL_NAMESPACE_END
#endif /* __KERNEL_SPLIT_DATA_TYPES_H__ */
diff --git a/intern/cycles/kernel/split/kernel_subsurface_scatter.h b/intern/cycles/kernel/split/kernel_subsurface_scatter.h
index 0b4d50c70ee..d5083b23f80 100644
--- a/intern/cycles/kernel/split/kernel_subsurface_scatter.h
+++ b/intern/cycles/kernel/split/kernel_subsurface_scatter.h
@@ -16,82 +16,306 @@
CCL_NAMESPACE_BEGIN
+#if defined(__BRANCHED_PATH__) && defined(__SUBSURFACE__)
-ccl_device void kernel_subsurface_scatter(KernelGlobals *kg,
- ccl_local_param unsigned int* local_queue_atomics)
+ccl_device_inline void kernel_split_branched_path_subsurface_indirect_light_init(KernelGlobals *kg, int ray_index)
{
-#ifdef __SUBSURFACE__
- if(ccl_local_id(0) == 0 && ccl_local_id(1) == 0) {
- *local_queue_atomics = 0;
+ kernel_split_branched_path_indirect_loop_init(kg, ray_index);
+
+ SplitBranchedState *branched_state = &kernel_split_state.branched_state[ray_index];
+
+ branched_state->ss_next_closure = 0;
+ branched_state->ss_next_sample = 0;
+
+ branched_state->num_hits = 0;
+ branched_state->next_hit = 0;
+
+ ADD_RAY_FLAG(kernel_split_state.ray_state, ray_index, RAY_BRANCHED_SUBSURFACE_INDIRECT);
+}
+
+ccl_device_noinline bool kernel_split_branched_path_subsurface_indirect_light_iter(KernelGlobals *kg, int ray_index)
+{
+ SplitBranchedState *branched_state = &kernel_split_state.branched_state[ray_index];
+
+ ShaderData *sd = &branched_state->sd;
+ RNG rng = kernel_split_state.rng[ray_index];
+ PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
+ ShaderData *emission_sd = &kernel_split_state.sd_DL_shadow[ray_index];
+
+ for(int i = branched_state->ss_next_closure; i < sd->num_closure; i++) {
+ ShaderClosure *sc = &sd->closure[i];
+
+ if(!CLOSURE_IS_BSSRDF(sc->type))
+ continue;
+
+ /* set up random number generator */
+ if(branched_state->ss_next_sample == 0 && branched_state->next_hit == 0 &&
+ branched_state->next_closure == 0 && branched_state->next_sample == 0)
+ {
+ branched_state->lcg_state = lcg_state_init(&rng,
+ branched_state->path_state.rng_offset,
+ branched_state->path_state.sample,
+ 0x68bc21eb);
+ }
+ int num_samples = kernel_data.integrator.subsurface_samples;
+ float num_samples_inv = 1.0f/num_samples;
+ RNG bssrdf_rng = cmj_hash(rng, i);
+
+ /* do subsurface scatter step with copy of shader data, this will
+ * replace the BSSRDF with a diffuse BSDF closure */
+ for(int j = branched_state->ss_next_sample; j < num_samples; j++) {
+ ccl_global SubsurfaceIntersection *ss_isect = &branched_state->ss_isect;
+ float bssrdf_u, bssrdf_v;
+ path_branched_rng_2D(kg,
+ &bssrdf_rng,
+ &branched_state->path_state,
+ j,
+ num_samples,
+ PRNG_BSDF_U,
+ &bssrdf_u,
+ &bssrdf_v);
+
+ /* intersection is expensive so avoid doing multiple times for the same input */
+ if(branched_state->next_hit == 0 && branched_state->next_closure == 0 && branched_state->next_sample == 0) {
+ RNG lcg_state = branched_state->lcg_state;
+ SubsurfaceIntersection ss_isect_private;
+
+ branched_state->num_hits = subsurface_scatter_multi_intersect(kg,
+ &ss_isect_private,
+ sd,
+ sc,
+ &lcg_state,
+ bssrdf_u, bssrdf_v,
+ true);
+
+ branched_state->lcg_state = lcg_state;
+ *ss_isect = ss_isect_private;
+ }
+
+#ifdef __VOLUME__
+ Ray volume_ray = branched_state->ray;
+ bool need_update_volume_stack =
+ kernel_data.integrator.use_volumes &&
+ sd->object_flag & SD_OBJECT_INTERSECTS_VOLUME;
+#endif /* __VOLUME__ */
+
+ /* compute lighting with the BSDF closure */
+ for(int hit = branched_state->next_hit; hit < branched_state->num_hits; hit++) {
+ ShaderData *bssrdf_sd = &kernel_split_state.sd[ray_index];
+ *bssrdf_sd = *sd; /* note: copy happens each iteration of inner loop, this is
+ * important as the indirect path will write into bssrdf_sd */
+
+ SubsurfaceIntersection ss_isect_private = *ss_isect;
+ subsurface_scatter_multi_setup(kg,
+ &ss_isect_private,
+ hit,
+ bssrdf_sd,
+ &branched_state->path_state,
+ branched_state->path_state.flag,
+ sc,
+ true);
+ *ss_isect = ss_isect_private;
+
+ ccl_global PathState *hit_state = &kernel_split_state.path_state[ray_index];
+ *hit_state = branched_state->path_state;
+
+ path_state_branch(hit_state, j, num_samples);
+
+#ifdef __VOLUME__
+ if(need_update_volume_stack) {
+ /* Setup ray from previous surface point to the new one. */
+ float3 P = ray_offset(bssrdf_sd->P, -bssrdf_sd->Ng);
+ volume_ray.D = normalize_len(P - volume_ray.P, &volume_ray.t);
+
+ /* this next part is expensive as it does scene intersection so only do once */
+ if(branched_state->next_closure == 0 && branched_state->next_sample == 0) {
+ for(int k = 0; k < VOLUME_STACK_SIZE; k++) {
+ branched_state->volume_stack[k] = hit_state->volume_stack[k];
+ }
+
+ kernel_volume_stack_update_for_subsurface(kg,
+ emission_sd,
+ &volume_ray,
+ branched_state->volume_stack);
+ }
+
+ for(int k = 0; k < VOLUME_STACK_SIZE; k++) {
+ hit_state->volume_stack[k] = branched_state->volume_stack[k];
+ }
+ }
+#endif /* __VOLUME__ */
+
+#ifdef __EMISSION__
+ if(branched_state->next_closure == 0 && branched_state->next_sample == 0) {
+ /* direct light */
+ if(kernel_data.integrator.use_direct_light) {
+ int all = (kernel_data.integrator.sample_all_lights_direct) ||
+ (branched_state->path_state.flag & PATH_RAY_SHADOW_CATCHER);
+ kernel_branched_path_surface_connect_light(kg,
+ &rng,
+ bssrdf_sd,
+ emission_sd,
+ hit_state,
+ branched_state->throughput,
+ num_samples_inv,
+ L,
+ all);
+ }
+ }
+#endif /* __EMISSION__ */
+
+ /* indirect light */
+ if(kernel_split_branched_path_surface_indirect_light_iter(kg,
+ ray_index,
+ num_samples_inv,
+ bssrdf_sd,
+ false,
+ false))
+ {
+ branched_state->ss_next_closure = i;
+ branched_state->ss_next_sample = j;
+ branched_state->next_hit = hit;
+
+ return true;
+ }
+
+ branched_state->next_closure = 0;
+ }
+
+ branched_state->next_hit = 0;
+ }
+
+ branched_state->ss_next_sample = 0;
+ }
+
+ branched_state->ss_next_closure = sd->num_closure;
+
+ branched_state->waiting_on_shared_samples = (branched_state->shared_sample_count > 0);
+ if(branched_state->waiting_on_shared_samples) {
+ return true;
+ }
+
+ kernel_split_branched_path_indirect_loop_end(kg, ray_index);
+
+ return false;
+}
+
+#endif /* __BRANCHED_PATH__ && __SUBSURFACE__ */
+
+ccl_device void kernel_subsurface_scatter(KernelGlobals *kg)
+{
+ int thread_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
+ if(thread_index == 0) {
+ /* We will empty both queues in this kernel. */
+ kernel_split_params.queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS] = 0;
+ kernel_split_params.queue_index[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS] = 0;
}
- ccl_barrier(CCL_LOCAL_MEM_FENCE);
int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
ray_index = get_ray_index(kg, ray_index,
QUEUE_ACTIVE_AND_REGENERATED_RAYS,
kernel_split_state.queue_data,
kernel_split_params.queue_size,
- 0);
-
-#ifdef __COMPUTE_DEVICE_GPU__
- /* If we are executing on a GPU device, we exit all threads that are not
- * required.
- *
- * If we are executing on a CPU device, then we need to keep all threads
- * active since we have barrier() calls later in the kernel. CPU devices,
- * expect all threads to execute barrier statement.
- */
- if(ray_index == QUEUE_EMPTY_SLOT) {
- return;
- }
-#endif
-
- char enqueue_flag = 0;
-
-#ifndef __COMPUTE_DEVICE_GPU__
- if(ray_index != QUEUE_EMPTY_SLOT) {
-#endif
+ 1);
+ get_ray_index(kg, thread_index,
+ QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
+ kernel_split_state.queue_data,
+ kernel_split_params.queue_size,
+ 1);
+#ifdef __SUBSURFACE__
ccl_global char *ray_state = kernel_split_state.ray_state;
- ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
- PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
- RNG rng = kernel_split_state.rng[ray_index];
- ccl_global Ray *ray = &kernel_split_state.ray[ray_index];
- ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index];
- ccl_global SubsurfaceIndirectRays *ss_indirect = &kernel_split_state.ss_rays[ray_index];
- ShaderData *sd = &kernel_split_state.sd[ray_index];
- ShaderData *emission_sd = &kernel_split_state.sd_DL_shadow[ray_index];
if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
+ ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
+ PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
+ RNG rng = kernel_split_state.rng[ray_index];
+ ccl_global Ray *ray = &kernel_split_state.ray[ray_index];
+ ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index];
+ ccl_global SubsurfaceIndirectRays *ss_indirect = &kernel_split_state.ss_rays[ray_index];
+ ShaderData *sd = &kernel_split_state.sd[ray_index];
+ ShaderData *emission_sd = &kernel_split_state.sd_DL_shadow[ray_index];
+
if(sd->flag & SD_BSSRDF) {
- if(kernel_path_subsurface_scatter(kg,
- sd,
- emission_sd,
- L,
- state,
- &rng,
- ray,
- throughput,
- ss_indirect)) {
- ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
- enqueue_flag = 1;
+
+#ifdef __BRANCHED_PATH__
+ if(!kernel_data.integrator.branched) {
+#endif
+ if(kernel_path_subsurface_scatter(kg,
+ sd,
+ emission_sd,
+ L,
+ state,
+ &rng,
+ ray,
+ throughput,
+ ss_indirect))
+ {
+ kernel_split_path_end(kg, ray_index);
+ }
+#ifdef __BRANCHED_PATH__
+ }
+ else if(IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT)) {
+ float bssrdf_probability;
+ ShaderClosure *sc = subsurface_scatter_pick_closure(kg, sd, &bssrdf_probability);
+
+ /* modify throughput for picking bssrdf or bsdf */
+ *throughput *= bssrdf_probability;
+
+ /* do bssrdf scatter step if we picked a bssrdf closure */
+ if(sc) {
+ uint lcg_state = lcg_state_init(&rng, state->rng_offset, state->sample, 0x68bc21eb);
+ float bssrdf_u, bssrdf_v;
+ path_state_rng_2D(kg,
+ &rng,
+ state,
+ PRNG_BSDF_U,
+ &bssrdf_u, &bssrdf_v);
+ subsurface_scatter_step(kg,
+ sd,
+ state,
+ state->flag,
+ sc,
+ &lcg_state,
+ bssrdf_u, bssrdf_v,
+ false);
+ }
+ }
+ else {
+ kernel_split_branched_path_subsurface_indirect_light_init(kg, ray_index);
+
+ if(kernel_split_branched_path_subsurface_indirect_light_iter(kg, ray_index)) {
+ ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED);
+ }
}
+#endif
}
kernel_split_state.rng[ray_index] = rng;
}
-#ifndef __COMPUTE_DEVICE_GPU__
+# ifdef __BRANCHED_PATH__
+ if(ccl_global_id(0) == 0 && ccl_global_id(1) == 0) {
+ kernel_split_params.queue_index[QUEUE_SUBSURFACE_INDIRECT_ITER] = 0;
}
-#endif
- /* Enqueue RAY_UPDATE_BUFFER rays. */
- enqueue_ray_index_local(ray_index,
- QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
- enqueue_flag,
- kernel_split_params.queue_size,
- local_queue_atomics,
- kernel_split_state.queue_data,
- kernel_split_params.queue_index);
+ /* iter loop */
+ ray_index = get_ray_index(kg, ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0),
+ QUEUE_SUBSURFACE_INDIRECT_ITER,
+ kernel_split_state.queue_data,
+ kernel_split_params.queue_size,
+ 1);
+
+ if(IS_STATE(ray_state, ray_index, RAY_SUBSURFACE_INDIRECT_NEXT_ITER)) {
+ /* for render passes, sum and reset indirect light pass variables
+ * for the next samples */
+ path_radiance_sum_indirect(&kernel_split_state.path_radiance[ray_index]);
+ path_radiance_reset_indirect(&kernel_split_state.path_radiance[ray_index]);
+
+ if(kernel_split_branched_path_subsurface_indirect_light_iter(kg, ray_index)) {
+ ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED);
+ }
+ }
+# endif /* __BRANCHED_PATH__ */
#endif /* __SUBSURFACE__ */
diff --git a/intern/cycles/kernel/svm/svm_closure.h b/intern/cycles/kernel/svm/svm_closure.h
index 1885e1af851..4268813b263 100644
--- a/intern/cycles/kernel/svm/svm_closure.h
+++ b/intern/cycles/kernel/svm/svm_closure.h
@@ -76,6 +76,345 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
float param2 = (stack_valid(param2_offset))? stack_load_float(stack, param2_offset): __uint_as_float(node.w);
switch(type) {
+#ifdef __PRINCIPLED__
+ case CLOSURE_BSDF_PRINCIPLED_ID: {
+ uint specular_offset, roughness_offset, specular_tint_offset, anisotropic_offset, sheen_offset,
+ sheen_tint_offset, clearcoat_offset, clearcoat_roughness_offset, eta_offset, transmission_offset,
+ anisotropic_rotation_offset, transmission_roughness_offset;
+ uint4 data_node2 = read_node(kg, offset);
+
+ float3 T = stack_load_float3(stack, data_node.y);
+ decode_node_uchar4(data_node.z, &specular_offset, &roughness_offset, &specular_tint_offset, &anisotropic_offset);
+ decode_node_uchar4(data_node.w, &sheen_offset, &sheen_tint_offset, &clearcoat_offset, &clearcoat_roughness_offset);
+ decode_node_uchar4(data_node2.x, &eta_offset, &transmission_offset, &anisotropic_rotation_offset, &transmission_roughness_offset);
+
+ // get Disney principled parameters
+ float metallic = param1;
+ float subsurface = param2;
+ float specular = stack_load_float(stack, specular_offset);
+ float roughness = stack_load_float(stack, roughness_offset);
+ float specular_tint = stack_load_float(stack, specular_tint_offset);
+ float anisotropic = stack_load_float(stack, anisotropic_offset);
+ float sheen = stack_load_float(stack, sheen_offset);
+ float sheen_tint = stack_load_float(stack, sheen_tint_offset);
+ float clearcoat = stack_load_float(stack, clearcoat_offset);
+ float clearcoat_roughness = stack_load_float(stack, clearcoat_roughness_offset);
+ float transmission = stack_load_float(stack, transmission_offset);
+ float anisotropic_rotation = stack_load_float(stack, anisotropic_rotation_offset);
+ float transmission_roughness = stack_load_float(stack, transmission_roughness_offset);
+ float eta = fmaxf(stack_load_float(stack, eta_offset), 1e-5f);
+
+ ClosureType distribution = stack_valid(data_node2.y) ? (ClosureType) data_node2.y : CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID;
+
+ /* rotate tangent */
+ if(anisotropic_rotation != 0.0f)
+ T = rotate_around_axis(T, N, anisotropic_rotation * M_2PI_F);
+
+ /* calculate ior */
+ float ior = (sd->flag & SD_BACKFACING) ? 1.0f / eta : eta;
+
+ // calculate fresnel for refraction
+ float cosNO = dot(N, sd->I);
+ float fresnel = fresnel_dielectric_cos(cosNO, ior);
+
+ // calculate weights of the diffuse and specular part
+ float diffuse_weight = (1.0f - saturate(metallic)) * (1.0f - saturate(transmission));
+
+ float final_transmission = saturate(transmission) * (1.0f - saturate(metallic));
+ float specular_weight = (1.0f - final_transmission);
+
+ // get the base color
+ uint4 data_base_color = read_node(kg, offset);
+ float3 base_color = stack_valid(data_base_color.x) ? stack_load_float3(stack, data_base_color.x) :
+ make_float3(__uint_as_float(data_base_color.y), __uint_as_float(data_base_color.z), __uint_as_float(data_base_color.w));
+
+ // get the additional clearcoat normal and subsurface scattering radius
+ uint4 data_cn_ssr = read_node(kg, offset);
+ float3 clearcoat_normal = stack_valid(data_cn_ssr.x) ? stack_load_float3(stack, data_cn_ssr.x) : sd->N;
+ float3 subsurface_radius = stack_valid(data_cn_ssr.y) ? stack_load_float3(stack, data_cn_ssr.y) : make_float3(1.0f, 1.0f, 1.0f);
+
+ // get the subsurface color
+ uint4 data_subsurface_color = read_node(kg, offset);
+ float3 subsurface_color = stack_valid(data_subsurface_color.x) ? stack_load_float3(stack, data_subsurface_color.x) :
+ make_float3(__uint_as_float(data_subsurface_color.y), __uint_as_float(data_subsurface_color.z), __uint_as_float(data_subsurface_color.w));
+
+ float3 weight = sd->svm_closure_weight * mix_weight;
+
+#ifdef __SUBSURFACE__
+ float3 mixed_ss_base_color = subsurface_color * subsurface + base_color * (1.0f - subsurface);
+ float3 subsurf_weight = weight * mixed_ss_base_color * diffuse_weight;
+ float subsurf_sample_weight = fabsf(average(subsurf_weight));
+
+ /* disable in case of diffuse ancestor, can't see it well then and
+ * adds considerably noise due to probabilities of continuing path
+ * getting lower and lower */
+ if(path_flag & PATH_RAY_DIFFUSE_ANCESTOR) {
+ subsurface = 0.0f;
+
+ /* need to set the base color in this case such that the
+ * rays get the correctly mixed color after transmitting
+ * the object */
+ base_color = mixed_ss_base_color;
+ }
+
+ /* diffuse */
+ if(fabsf(average(mixed_ss_base_color)) > CLOSURE_WEIGHT_CUTOFF) {
+ if(subsurface <= CLOSURE_WEIGHT_CUTOFF && diffuse_weight > CLOSURE_WEIGHT_CUTOFF) {
+ float3 diff_weight = weight * base_color * diffuse_weight;
+
+ PrincipledDiffuseBsdf *bsdf = (PrincipledDiffuseBsdf*)bsdf_alloc(sd, sizeof(PrincipledDiffuseBsdf), diff_weight);
+
+ if(bsdf) {
+ bsdf->N = N;
+ bsdf->roughness = roughness;
+
+ /* setup bsdf */
+ sd->flag |= bsdf_principled_diffuse_setup(bsdf);
+ }
+ }
+ else if(subsurface > CLOSURE_WEIGHT_CUTOFF && subsurf_sample_weight > CLOSURE_WEIGHT_CUTOFF) {
+ /* radius * scale */
+ float3 radius = subsurface_radius * subsurface;
+ /* sharpness */
+ float sharpness = 0.0f;
+ /* texture color blur */
+ float texture_blur = 0.0f;
+
+ /* create one closure per color channel */
+ Bssrdf *bssrdf = bssrdf_alloc(sd, make_float3(subsurf_weight.x, 0.0f, 0.0f));
+ if(bssrdf) {
+ bssrdf->sample_weight = subsurf_sample_weight;
+ bssrdf->radius = radius.x;
+ bssrdf->texture_blur = texture_blur;
+ bssrdf->albedo = subsurface_color.x;
+ bssrdf->sharpness = sharpness;
+ bssrdf->N = N;
+ bssrdf->roughness = roughness;
+
+ /* setup bsdf */
+ sd->flag |= bssrdf_setup(bssrdf, (ClosureType)CLOSURE_BSSRDF_PRINCIPLED_ID);
+ }
+
+ bssrdf = bssrdf_alloc(sd, make_float3(0.0f, subsurf_weight.y, 0.0f));
+ if(bssrdf) {
+ bssrdf->sample_weight = subsurf_sample_weight;
+ bssrdf->radius = radius.y;
+ bssrdf->texture_blur = texture_blur;
+ bssrdf->albedo = subsurface_color.y;
+ bssrdf->sharpness = sharpness;
+ bssrdf->N = N;
+ bssrdf->roughness = roughness;
+
+ /* setup bsdf */
+ sd->flag |= bssrdf_setup(bssrdf, (ClosureType)CLOSURE_BSSRDF_PRINCIPLED_ID);
+ }
+
+ bssrdf = bssrdf_alloc(sd, make_float3(0.0f, 0.0f, subsurf_weight.z));
+ if(bssrdf) {
+ bssrdf->sample_weight = subsurf_sample_weight;
+ bssrdf->radius = radius.z;
+ bssrdf->texture_blur = texture_blur;
+ bssrdf->albedo = subsurface_color.z;
+ bssrdf->sharpness = sharpness;
+ bssrdf->N = N;
+ bssrdf->roughness = roughness;
+
+ /* setup bsdf */
+ sd->flag |= bssrdf_setup(bssrdf, (ClosureType)CLOSURE_BSSRDF_PRINCIPLED_ID);
+ }
+ }
+ }
+#else
+ /* diffuse */
+ if(diffuse_weight > CLOSURE_WEIGHT_CUTOFF) {
+ float3 diff_weight = weight * base_color * diffuse_weight;
+
+ PrincipledDiffuseBsdf *bsdf = (PrincipledDiffuseBsdf*)bsdf_alloc(sd, sizeof(PrincipledDiffuseBsdf), diff_weight);
+
+ if(bsdf) {
+ bsdf->N = N;
+ bsdf->roughness = roughness;
+
+ /* setup bsdf */
+ sd->flag |= bsdf_principled_diffuse_setup(bsdf);
+ }
+ }
+#endif
+
+ /* sheen */
+ if(diffuse_weight > CLOSURE_WEIGHT_CUTOFF && sheen > CLOSURE_WEIGHT_CUTOFF) {
+ float m_cdlum = linear_rgb_to_gray(base_color);
+ float3 m_ctint = m_cdlum > 0.0f ? base_color / m_cdlum : make_float3(1.0f, 1.0f, 1.0f); // normalize lum. to isolate hue+sat
+
+ /* color of the sheen component */
+ float3 sheen_color = make_float3(1.0f, 1.0f, 1.0f) * (1.0f - sheen_tint) + m_ctint * sheen_tint;
+
+ float3 sheen_weight = weight * sheen * sheen_color * diffuse_weight;
+
+ PrincipledSheenBsdf *bsdf = (PrincipledSheenBsdf*)bsdf_alloc(sd, sizeof(PrincipledSheenBsdf), sheen_weight);
+
+ if(bsdf) {
+ bsdf->N = N;
+
+ /* setup bsdf */
+ sd->flag |= bsdf_principled_sheen_setup(bsdf);
+ }
+ }
+
+ /* specular reflection */
+#ifdef __CAUSTICS_TRICKS__
+ if(kernel_data.integrator.caustics_reflective || (path_flag & PATH_RAY_DIFFUSE) == 0) {
+#endif
+ if(specular_weight > CLOSURE_WEIGHT_CUTOFF && (specular > CLOSURE_WEIGHT_CUTOFF || metallic > CLOSURE_WEIGHT_CUTOFF)) {
+ float3 spec_weight = weight * specular_weight;
+
+ MicrofacetBsdf *bsdf = (MicrofacetBsdf*)bsdf_alloc(sd, sizeof(MicrofacetBsdf), spec_weight);
+ MicrofacetExtra *extra = (MicrofacetExtra*)closure_alloc_extra(sd, sizeof(MicrofacetExtra));
+
+ if(bsdf && extra) {
+ bsdf->N = N;
+ bsdf->ior = (2.0f / (1.0f - safe_sqrtf(0.08f * specular))) - 1.0f;
+ bsdf->T = T;
+ bsdf->extra = extra;
+
+ float aspect = safe_sqrtf(1.0f - anisotropic * 0.9f);
+ float r2 = roughness * roughness;
+
+ bsdf->alpha_x = r2 / aspect;
+ bsdf->alpha_y = r2 * aspect;
+
+ float m_cdlum = 0.3f * base_color.x + 0.6f * base_color.y + 0.1f * base_color.z; // luminance approx.
+ float3 m_ctint = m_cdlum > 0.0f ? base_color / m_cdlum : make_float3(0.0f, 0.0f, 0.0f); // normalize lum. to isolate hue+sat
+ float3 tmp_col = make_float3(1.0f, 1.0f, 1.0f) * (1.0f - specular_tint) + m_ctint * specular_tint;
+
+ bsdf->extra->cspec0 = (specular * 0.08f * tmp_col) * (1.0f - metallic) + base_color * metallic;
+ bsdf->extra->color = base_color;
+
+ /* setup bsdf */
+ if(distribution == CLOSURE_BSDF_MICROFACET_GGX_GLASS_ID || roughness <= 0.075f) /* use single-scatter GGX */
+ sd->flag |= bsdf_microfacet_ggx_aniso_fresnel_setup(bsdf, sd);
+ else /* use multi-scatter GGX */
+ sd->flag |= bsdf_microfacet_multi_ggx_aniso_fresnel_setup(bsdf, sd);
+ }
+ }
+#ifdef __CAUSTICS_TRICKS__
+ }
+#endif
+
+ /* BSDF */
+#ifdef __CAUSTICS_TRICKS__
+ if(kernel_data.integrator.caustics_reflective || kernel_data.integrator.caustics_refractive || (path_flag & PATH_RAY_DIFFUSE) == 0) {
+#endif
+ if(final_transmission > CLOSURE_WEIGHT_CUTOFF) {
+ float3 glass_weight = weight * final_transmission;
+ float3 cspec0 = base_color * specular_tint + make_float3(1.0f, 1.0f, 1.0f) * (1.0f - specular_tint);
+
+ if(roughness <= 5e-2f || distribution == CLOSURE_BSDF_MICROFACET_GGX_GLASS_ID) { /* use single-scatter GGX */
+ float refl_roughness = roughness;
+
+ /* reflection */
+#ifdef __CAUSTICS_TRICKS__
+ if(kernel_data.integrator.caustics_reflective || (path_flag & PATH_RAY_DIFFUSE) == 0)
+#endif
+ {
+ MicrofacetBsdf *bsdf = (MicrofacetBsdf*)bsdf_alloc(sd, sizeof(MicrofacetBsdf), glass_weight*fresnel);
+ MicrofacetExtra *extra = (MicrofacetExtra*)closure_alloc_extra(sd, sizeof(MicrofacetExtra));
+
+ if(bsdf && extra) {
+ bsdf->N = N;
+ bsdf->extra = extra;
+
+ bsdf->alpha_x = refl_roughness * refl_roughness;
+ bsdf->alpha_y = refl_roughness * refl_roughness;
+ bsdf->ior = ior;
+
+ bsdf->extra->color = base_color;
+ bsdf->extra->cspec0 = cspec0;
+
+ /* setup bsdf */
+ sd->flag |= bsdf_microfacet_ggx_fresnel_setup(bsdf, sd);
+ }
+ }
+
+ /* refraction */
+#ifdef __CAUSTICS_TRICKS__
+ if(kernel_data.integrator.caustics_refractive || (path_flag & PATH_RAY_DIFFUSE) == 0)
+#endif
+ {
+ MicrofacetBsdf *bsdf = (MicrofacetBsdf*)bsdf_alloc(sd, sizeof(MicrofacetBsdf), base_color*glass_weight*(1.0f - fresnel));
+
+ if(bsdf) {
+ bsdf->N = N;
+
+ if(distribution == CLOSURE_BSDF_MICROFACET_GGX_GLASS_ID)
+ transmission_roughness = 1.0f - (1.0f - refl_roughness) * (1.0f - transmission_roughness);
+ else
+ transmission_roughness = refl_roughness;
+
+ bsdf->alpha_x = transmission_roughness * transmission_roughness;
+ bsdf->alpha_y = transmission_roughness * transmission_roughness;
+ bsdf->ior = ior;
+
+ /* setup bsdf */
+ sd->flag |= bsdf_microfacet_ggx_refraction_setup(bsdf);
+ }
+ }
+ }
+ else { /* use multi-scatter GGX */
+ MicrofacetBsdf *bsdf = (MicrofacetBsdf*)bsdf_alloc(sd, sizeof(MicrofacetBsdf), glass_weight);
+ MicrofacetExtra *extra = (MicrofacetExtra*)closure_alloc_extra(sd, sizeof(MicrofacetExtra));
+
+ if(bsdf && extra) {
+ bsdf->N = N;
+ bsdf->extra = extra;
+ bsdf->T = make_float3(0.0f, 0.0f, 0.0f);
+
+ bsdf->alpha_x = roughness * roughness;
+ bsdf->alpha_y = roughness * roughness;
+ bsdf->ior = ior;
+
+ bsdf->extra->color = base_color;
+ bsdf->extra->cspec0 = cspec0;
+
+ /* setup bsdf */
+ sd->flag |= bsdf_microfacet_multi_ggx_glass_fresnel_setup(bsdf, sd);
+ }
+ }
+ }
+#ifdef __CAUSTICS_TRICKS__
+ }
+#endif
+
+ /* clearcoat */
+#ifdef __CAUSTICS_TRICKS__
+ if(kernel_data.integrator.caustics_reflective || (path_flag & PATH_RAY_DIFFUSE) == 0) {
+#endif
+ if(clearcoat > CLOSURE_WEIGHT_CUTOFF) {
+ MicrofacetBsdf *bsdf = (MicrofacetBsdf*)bsdf_alloc(sd, sizeof(MicrofacetBsdf), weight);
+ MicrofacetExtra *extra = (MicrofacetExtra*)closure_alloc_extra(sd, sizeof(MicrofacetExtra));
+
+ if(bsdf && extra) {
+ bsdf->N = clearcoat_normal;
+ bsdf->ior = 1.5f;
+ bsdf->extra = extra;
+
+ bsdf->alpha_x = clearcoat_roughness * clearcoat_roughness;
+ bsdf->alpha_y = clearcoat_roughness * clearcoat_roughness;
+
+ bsdf->extra->cspec0 = make_float3(0.04f, 0.04f, 0.04f);
+ bsdf->extra->clearcoat = clearcoat;
+
+ /* setup bsdf */
+ sd->flag |= bsdf_microfacet_ggx_clearcoat_setup(bsdf, sd);
+ }
+ }
+#ifdef __CAUSTICS_TRICKS__
+ }
+#endif
+
+ break;
+ }
+#endif /* __PRINCIPLED__ */
case CLOSURE_BSDF_DIFFUSE_ID: {
float3 weight = sd->svm_closure_weight * mix_weight;
OrenNayarBsdf *bsdf = (OrenNayarBsdf*)bsdf_alloc(sd, sizeof(OrenNayarBsdf), weight);
@@ -110,6 +449,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
ShaderClosure *bsdf = bsdf_alloc(sd, sizeof(ShaderClosure), weight);
if(bsdf) {
+ bsdf->N = N;
sd->flag |= bsdf_transparent_setup(bsdf);
}
break;
@@ -344,6 +684,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
#ifdef __CAUSTICS_TRICKS__
if(!kernel_data.integrator.caustics_reflective && (path_flag & PATH_RAY_DIFFUSE))
break;
+ ATTR_FALLTHROUGH;
#endif
case CLOSURE_BSDF_DIFFUSE_TOON_ID: {
float3 weight = sd->svm_closure_weight * mix_weight;
@@ -370,6 +711,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
ShaderClosure *bsdf = bsdf_alloc(sd, sizeof(ShaderClosure), weight);
if(bsdf) {
+ bsdf->N = N;
/* todo: giving a fixed weight here will cause issues when
* mixing multiple BSDFS. energy will not be conserved and
* the throughput can blow up after multiple bounces. we
@@ -383,6 +725,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
HairBsdf *bsdf = (HairBsdf*)bsdf_alloc(sd, sizeof(HairBsdf), weight);
if(bsdf) {
+ bsdf->N = N;
bsdf->roughness1 = param1;
bsdf->roughness2 = param2;
bsdf->offset = -stack_load_float(stack, data_node.z);
diff --git a/intern/cycles/kernel/svm/svm_displace.h b/intern/cycles/kernel/svm/svm_displace.h
index c94fa130af7..656357be52d 100644
--- a/intern/cycles/kernel/svm/svm_displace.h
+++ b/intern/cycles/kernel/svm/svm_displace.h
@@ -63,8 +63,13 @@ ccl_device void svm_node_set_bump(KernelGlobals *kg, ShaderData *sd, float *stac
strength = max(strength, 0.0f);
/* compute and output perturbed normal */
- float3 normal_out = normalize(absdet*normal_in - distance*signf(det)*surfgrad);
- normal_out = normalize(strength*normal_out + (1.0f - strength)*normal_in);
+ float3 normal_out = safe_normalize(absdet*normal_in - distance*signf(det)*surfgrad);
+ if(is_zero(normal_out)) {
+ normal_out = normal_in;
+ }
+ else {
+ normal_out = normalize(strength*normal_out + (1.0f - strength)*normal_in);
+ }
if(use_object_space) {
object_normal_transform(kg, sd, &normal_out);
diff --git a/intern/cycles/kernel/svm/svm_geometry.h b/intern/cycles/kernel/svm/svm_geometry.h
index 4a09d9f6653..cce4e89e715 100644
--- a/intern/cycles/kernel/svm/svm_geometry.h
+++ b/intern/cycles/kernel/svm/svm_geometry.h
@@ -37,6 +37,7 @@ ccl_device_inline void svm_node_geometry(KernelGlobals *kg,
#ifdef __UV__
case NODE_GEOM_uv: data = make_float3(sd->u, sd->v, 0.0f); break;
#endif
+ default: data = make_float3(0.0f, 0.0f, 0.0f);
}
stack_store_float3(stack, out_offset, data);
diff --git a/intern/cycles/kernel/svm/svm_image.h b/intern/cycles/kernel/svm/svm_image.h
index 76acc9253a1..7be03dcd65a 100644
--- a/intern/cycles/kernel/svm/svm_image.h
+++ b/intern/cycles/kernel/svm/svm_image.h
@@ -16,29 +16,10 @@
CCL_NAMESPACE_BEGIN
-/* Float4 textures on various devices. */
-#if defined(__KERNEL_CPU__)
-# define TEX_NUM_FLOAT4_IMAGES TEX_NUM_FLOAT4_CPU
-#elif defined(__KERNEL_CUDA__)
-# if __CUDA_ARCH__ < 300
-# define TEX_NUM_FLOAT4_IMAGES TEX_NUM_FLOAT4_CUDA
-# else
-# define TEX_NUM_FLOAT4_IMAGES TEX_NUM_FLOAT4_CUDA_KEPLER
-# endif
-#else
-# define TEX_NUM_FLOAT4_IMAGES TEX_NUM_FLOAT4_OPENCL
-#endif
-
ccl_device float4 svm_image_texture(KernelGlobals *kg, int id, float x, float y, uint srgb, uint use_alpha)
{
#ifdef __KERNEL_CPU__
-# ifdef __KERNEL_SSE2__
- ssef r_ssef;
- float4 &r = (float4 &)r_ssef;
- r = kernel_tex_image_interp(id, x, y);
-# else
float4 r = kernel_tex_image_interp(id, x, y);
-# endif
#elif defined(__KERNEL_OPENCL__)
float4 r = kernel_tex_image_interp(kg, id, x, y);
#else
@@ -56,94 +37,94 @@ ccl_device float4 svm_image_texture(KernelGlobals *kg, int id, float x, float y,
switch(id) {
case 0: r = kernel_tex_image_interp(__tex_image_float4_000, x, y); break;
- case 1: r = kernel_tex_image_interp(__tex_image_float4_001, x, y); break;
- case 2: r = kernel_tex_image_interp(__tex_image_float4_002, x, y); break;
- case 3: r = kernel_tex_image_interp(__tex_image_float4_003, x, y); break;
- case 4: r = kernel_tex_image_interp(__tex_image_float4_004, x, y); break;
- case 5: r = kernel_tex_image_interp(__tex_image_byte4_005, x, y); break;
- case 6: r = kernel_tex_image_interp(__tex_image_byte4_006, x, y); break;
- case 7: r = kernel_tex_image_interp(__tex_image_byte4_007, x, y); break;
- case 8: r = kernel_tex_image_interp(__tex_image_byte4_008, x, y); break;
+ case 8: r = kernel_tex_image_interp(__tex_image_float4_008, x, y); break;
+ case 16: r = kernel_tex_image_interp(__tex_image_float4_016, x, y); break;
+ case 24: r = kernel_tex_image_interp(__tex_image_float4_024, x, y); break;
+ case 32: r = kernel_tex_image_interp(__tex_image_float4_032, x, y); break;
+ case 1: r = kernel_tex_image_interp(__tex_image_byte4_001, x, y); break;
case 9: r = kernel_tex_image_interp(__tex_image_byte4_009, x, y); break;
- case 10: r = kernel_tex_image_interp(__tex_image_byte4_010, x, y); break;
- case 11: r = kernel_tex_image_interp(__tex_image_byte4_011, x, y); break;
- case 12: r = kernel_tex_image_interp(__tex_image_byte4_012, x, y); break;
- case 13: r = kernel_tex_image_interp(__tex_image_byte4_013, x, y); break;
- case 14: r = kernel_tex_image_interp(__tex_image_byte4_014, x, y); break;
- case 15: r = kernel_tex_image_interp(__tex_image_byte4_015, x, y); break;
- case 16: r = kernel_tex_image_interp(__tex_image_byte4_016, x, y); break;
case 17: r = kernel_tex_image_interp(__tex_image_byte4_017, x, y); break;
- case 18: r = kernel_tex_image_interp(__tex_image_byte4_018, x, y); break;
- case 19: r = kernel_tex_image_interp(__tex_image_byte4_019, x, y); break;
- case 20: r = kernel_tex_image_interp(__tex_image_byte4_020, x, y); break;
- case 21: r = kernel_tex_image_interp(__tex_image_byte4_021, x, y); break;
- case 22: r = kernel_tex_image_interp(__tex_image_byte4_022, x, y); break;
- case 23: r = kernel_tex_image_interp(__tex_image_byte4_023, x, y); break;
- case 24: r = kernel_tex_image_interp(__tex_image_byte4_024, x, y); break;
case 25: r = kernel_tex_image_interp(__tex_image_byte4_025, x, y); break;
- case 26: r = kernel_tex_image_interp(__tex_image_byte4_026, x, y); break;
- case 27: r = kernel_tex_image_interp(__tex_image_byte4_027, x, y); break;
- case 28: r = kernel_tex_image_interp(__tex_image_byte4_028, x, y); break;
- case 29: r = kernel_tex_image_interp(__tex_image_byte4_029, x, y); break;
- case 30: r = kernel_tex_image_interp(__tex_image_byte4_030, x, y); break;
- case 31: r = kernel_tex_image_interp(__tex_image_byte4_031, x, y); break;
- case 32: r = kernel_tex_image_interp(__tex_image_byte4_032, x, y); break;
case 33: r = kernel_tex_image_interp(__tex_image_byte4_033, x, y); break;
- case 34: r = kernel_tex_image_interp(__tex_image_byte4_034, x, y); break;
- case 35: r = kernel_tex_image_interp(__tex_image_byte4_035, x, y); break;
- case 36: r = kernel_tex_image_interp(__tex_image_byte4_036, x, y); break;
- case 37: r = kernel_tex_image_interp(__tex_image_byte4_037, x, y); break;
- case 38: r = kernel_tex_image_interp(__tex_image_byte4_038, x, y); break;
- case 39: r = kernel_tex_image_interp(__tex_image_byte4_039, x, y); break;
- case 40: r = kernel_tex_image_interp(__tex_image_byte4_040, x, y); break;
case 41: r = kernel_tex_image_interp(__tex_image_byte4_041, x, y); break;
- case 42: r = kernel_tex_image_interp(__tex_image_byte4_042, x, y); break;
- case 43: r = kernel_tex_image_interp(__tex_image_byte4_043, x, y); break;
- case 44: r = kernel_tex_image_interp(__tex_image_byte4_044, x, y); break;
- case 45: r = kernel_tex_image_interp(__tex_image_byte4_045, x, y); break;
- case 46: r = kernel_tex_image_interp(__tex_image_byte4_046, x, y); break;
- case 47: r = kernel_tex_image_interp(__tex_image_byte4_047, x, y); break;
- case 48: r = kernel_tex_image_interp(__tex_image_byte4_048, x, y); break;
case 49: r = kernel_tex_image_interp(__tex_image_byte4_049, x, y); break;
- case 50: r = kernel_tex_image_interp(__tex_image_byte4_050, x, y); break;
- case 51: r = kernel_tex_image_interp(__tex_image_byte4_051, x, y); break;
- case 52: r = kernel_tex_image_interp(__tex_image_byte4_052, x, y); break;
- case 53: r = kernel_tex_image_interp(__tex_image_byte4_053, x, y); break;
- case 54: r = kernel_tex_image_interp(__tex_image_byte4_054, x, y); break;
- case 55: r = kernel_tex_image_interp(__tex_image_byte4_055, x, y); break;
- case 56: r = kernel_tex_image_interp(__tex_image_byte4_056, x, y); break;
case 57: r = kernel_tex_image_interp(__tex_image_byte4_057, x, y); break;
- case 58: r = kernel_tex_image_interp(__tex_image_byte4_058, x, y); break;
- case 59: r = kernel_tex_image_interp(__tex_image_byte4_059, x, y); break;
- case 60: r = kernel_tex_image_interp(__tex_image_byte4_060, x, y); break;
- case 61: r = kernel_tex_image_interp(__tex_image_byte4_061, x, y); break;
- case 62: r = kernel_tex_image_interp(__tex_image_byte4_062, x, y); break;
- case 63: r = kernel_tex_image_interp(__tex_image_byte4_063, x, y); break;
- case 64: r = kernel_tex_image_interp(__tex_image_byte4_064, x, y); break;
case 65: r = kernel_tex_image_interp(__tex_image_byte4_065, x, y); break;
- case 66: r = kernel_tex_image_interp(__tex_image_byte4_066, x, y); break;
- case 67: r = kernel_tex_image_interp(__tex_image_byte4_067, x, y); break;
- case 68: r = kernel_tex_image_interp(__tex_image_byte4_068, x, y); break;
- case 69: r = kernel_tex_image_interp(__tex_image_byte4_069, x, y); break;
- case 70: r = kernel_tex_image_interp(__tex_image_byte4_070, x, y); break;
- case 71: r = kernel_tex_image_interp(__tex_image_byte4_071, x, y); break;
- case 72: r = kernel_tex_image_interp(__tex_image_byte4_072, x, y); break;
case 73: r = kernel_tex_image_interp(__tex_image_byte4_073, x, y); break;
- case 74: r = kernel_tex_image_interp(__tex_image_byte4_074, x, y); break;
- case 75: r = kernel_tex_image_interp(__tex_image_byte4_075, x, y); break;
- case 76: r = kernel_tex_image_interp(__tex_image_byte4_076, x, y); break;
- case 77: r = kernel_tex_image_interp(__tex_image_byte4_077, x, y); break;
- case 78: r = kernel_tex_image_interp(__tex_image_byte4_078, x, y); break;
- case 79: r = kernel_tex_image_interp(__tex_image_byte4_079, x, y); break;
- case 80: r = kernel_tex_image_interp(__tex_image_byte4_080, x, y); break;
case 81: r = kernel_tex_image_interp(__tex_image_byte4_081, x, y); break;
- case 82: r = kernel_tex_image_interp(__tex_image_byte4_082, x, y); break;
- case 83: r = kernel_tex_image_interp(__tex_image_byte4_083, x, y); break;
- case 84: r = kernel_tex_image_interp(__tex_image_byte4_084, x, y); break;
- case 85: r = kernel_tex_image_interp(__tex_image_byte4_085, x, y); break;
- case 86: r = kernel_tex_image_interp(__tex_image_byte4_086, x, y); break;
- case 87: r = kernel_tex_image_interp(__tex_image_byte4_087, x, y); break;
- case 88: r = kernel_tex_image_interp(__tex_image_byte4_088, x, y); break;
+ case 89: r = kernel_tex_image_interp(__tex_image_byte4_089, x, y); break;
+ case 97: r = kernel_tex_image_interp(__tex_image_byte4_097, x, y); break;
+ case 105: r = kernel_tex_image_interp(__tex_image_byte4_105, x, y); break;
+ case 113: r = kernel_tex_image_interp(__tex_image_byte4_113, x, y); break;
+ case 121: r = kernel_tex_image_interp(__tex_image_byte4_121, x, y); break;
+ case 129: r = kernel_tex_image_interp(__tex_image_byte4_129, x, y); break;
+ case 137: r = kernel_tex_image_interp(__tex_image_byte4_137, x, y); break;
+ case 145: r = kernel_tex_image_interp(__tex_image_byte4_145, x, y); break;
+ case 153: r = kernel_tex_image_interp(__tex_image_byte4_153, x, y); break;
+ case 161: r = kernel_tex_image_interp(__tex_image_byte4_161, x, y); break;
+ case 169: r = kernel_tex_image_interp(__tex_image_byte4_169, x, y); break;
+ case 177: r = kernel_tex_image_interp(__tex_image_byte4_177, x, y); break;
+ case 185: r = kernel_tex_image_interp(__tex_image_byte4_185, x, y); break;
+ case 193: r = kernel_tex_image_interp(__tex_image_byte4_193, x, y); break;
+ case 201: r = kernel_tex_image_interp(__tex_image_byte4_201, x, y); break;
+ case 209: r = kernel_tex_image_interp(__tex_image_byte4_209, x, y); break;
+ case 217: r = kernel_tex_image_interp(__tex_image_byte4_217, x, y); break;
+ case 225: r = kernel_tex_image_interp(__tex_image_byte4_225, x, y); break;
+ case 233: r = kernel_tex_image_interp(__tex_image_byte4_233, x, y); break;
+ case 241: r = kernel_tex_image_interp(__tex_image_byte4_241, x, y); break;
+ case 249: r = kernel_tex_image_interp(__tex_image_byte4_249, x, y); break;
+ case 257: r = kernel_tex_image_interp(__tex_image_byte4_257, x, y); break;
+ case 265: r = kernel_tex_image_interp(__tex_image_byte4_265, x, y); break;
+ case 273: r = kernel_tex_image_interp(__tex_image_byte4_273, x, y); break;
+ case 281: r = kernel_tex_image_interp(__tex_image_byte4_281, x, y); break;
+ case 289: r = kernel_tex_image_interp(__tex_image_byte4_289, x, y); break;
+ case 297: r = kernel_tex_image_interp(__tex_image_byte4_297, x, y); break;
+ case 305: r = kernel_tex_image_interp(__tex_image_byte4_305, x, y); break;
+ case 313: r = kernel_tex_image_interp(__tex_image_byte4_313, x, y); break;
+ case 321: r = kernel_tex_image_interp(__tex_image_byte4_321, x, y); break;
+ case 329: r = kernel_tex_image_interp(__tex_image_byte4_329, x, y); break;
+ case 337: r = kernel_tex_image_interp(__tex_image_byte4_337, x, y); break;
+ case 345: r = kernel_tex_image_interp(__tex_image_byte4_345, x, y); break;
+ case 353: r = kernel_tex_image_interp(__tex_image_byte4_353, x, y); break;
+ case 361: r = kernel_tex_image_interp(__tex_image_byte4_361, x, y); break;
+ case 369: r = kernel_tex_image_interp(__tex_image_byte4_369, x, y); break;
+ case 377: r = kernel_tex_image_interp(__tex_image_byte4_377, x, y); break;
+ case 385: r = kernel_tex_image_interp(__tex_image_byte4_385, x, y); break;
+ case 393: r = kernel_tex_image_interp(__tex_image_byte4_393, x, y); break;
+ case 401: r = kernel_tex_image_interp(__tex_image_byte4_401, x, y); break;
+ case 409: r = kernel_tex_image_interp(__tex_image_byte4_409, x, y); break;
+ case 417: r = kernel_tex_image_interp(__tex_image_byte4_417, x, y); break;
+ case 425: r = kernel_tex_image_interp(__tex_image_byte4_425, x, y); break;
+ case 433: r = kernel_tex_image_interp(__tex_image_byte4_433, x, y); break;
+ case 441: r = kernel_tex_image_interp(__tex_image_byte4_441, x, y); break;
+ case 449: r = kernel_tex_image_interp(__tex_image_byte4_449, x, y); break;
+ case 457: r = kernel_tex_image_interp(__tex_image_byte4_457, x, y); break;
+ case 465: r = kernel_tex_image_interp(__tex_image_byte4_465, x, y); break;
+ case 473: r = kernel_tex_image_interp(__tex_image_byte4_473, x, y); break;
+ case 481: r = kernel_tex_image_interp(__tex_image_byte4_481, x, y); break;
+ case 489: r = kernel_tex_image_interp(__tex_image_byte4_489, x, y); break;
+ case 497: r = kernel_tex_image_interp(__tex_image_byte4_497, x, y); break;
+ case 505: r = kernel_tex_image_interp(__tex_image_byte4_505, x, y); break;
+ case 513: r = kernel_tex_image_interp(__tex_image_byte4_513, x, y); break;
+ case 521: r = kernel_tex_image_interp(__tex_image_byte4_521, x, y); break;
+ case 529: r = kernel_tex_image_interp(__tex_image_byte4_529, x, y); break;
+ case 537: r = kernel_tex_image_interp(__tex_image_byte4_537, x, y); break;
+ case 545: r = kernel_tex_image_interp(__tex_image_byte4_545, x, y); break;
+ case 553: r = kernel_tex_image_interp(__tex_image_byte4_553, x, y); break;
+ case 561: r = kernel_tex_image_interp(__tex_image_byte4_561, x, y); break;
+ case 569: r = kernel_tex_image_interp(__tex_image_byte4_569, x, y); break;
+ case 577: r = kernel_tex_image_interp(__tex_image_byte4_577, x, y); break;
+ case 585: r = kernel_tex_image_interp(__tex_image_byte4_585, x, y); break;
+ case 593: r = kernel_tex_image_interp(__tex_image_byte4_593, x, y); break;
+ case 601: r = kernel_tex_image_interp(__tex_image_byte4_601, x, y); break;
+ case 609: r = kernel_tex_image_interp(__tex_image_byte4_609, x, y); break;
+ case 617: r = kernel_tex_image_interp(__tex_image_byte4_617, x, y); break;
+ case 625: r = kernel_tex_image_interp(__tex_image_byte4_625, x, y); break;
+ case 633: r = kernel_tex_image_interp(__tex_image_byte4_633, x, y); break;
+ case 641: r = kernel_tex_image_interp(__tex_image_byte4_641, x, y); break;
+ case 649: r = kernel_tex_image_interp(__tex_image_byte4_649, x, y); break;
+ case 657: r = kernel_tex_image_interp(__tex_image_byte4_657, x, y); break;
+ case 665: r = kernel_tex_image_interp(__tex_image_byte4_665, x, y); break;
default:
kernel_assert(0);
return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
@@ -151,8 +132,13 @@ ccl_device float4 svm_image_texture(KernelGlobals *kg, int id, float x, float y,
# else
CUtexObject tex = kernel_tex_fetch(__bindless_mapping, id);
/* float4, byte4 and half4 */
- if(id < TEX_START_FLOAT_CUDA_KEPLER)
+ const int texture_type = kernel_tex_type(id);
+ if(texture_type == IMAGE_DATA_TYPE_FLOAT4 ||
+ texture_type == IMAGE_DATA_TYPE_BYTE4 ||
+ texture_type == IMAGE_DATA_TYPE_HALF4)
+ {
r = kernel_tex_image_interp_float4(tex, x, y);
+ }
/* float, byte and half */
else {
float f = kernel_tex_image_interp_float(tex, x, y);
@@ -161,40 +147,22 @@ ccl_device float4 svm_image_texture(KernelGlobals *kg, int id, float x, float y,
# endif
#endif
-#ifdef __KERNEL_SSE2__
- float alpha = r.w;
+ const float alpha = r.w;
if(use_alpha && alpha != 1.0f && alpha != 0.0f) {
- r_ssef = r_ssef / ssef(alpha);
- if(id >= TEX_NUM_FLOAT4_IMAGES)
- r_ssef = min(r_ssef, ssef(1.0f));
- r.w = alpha;
- }
-
- if(srgb) {
- r_ssef = color_srgb_to_scene_linear(r_ssef);
- r.w = alpha;
- }
-#else
- if(use_alpha && r.w != 1.0f && r.w != 0.0f) {
- float invw = 1.0f/r.w;
- r.x *= invw;
- r.y *= invw;
- r.z *= invw;
-
- if(id >= TEX_NUM_FLOAT4_IMAGES) {
- r.x = min(r.x, 1.0f);
- r.y = min(r.y, 1.0f);
- r.z = min(r.z, 1.0f);
+ r /= alpha;
+ const int texture_type = kernel_tex_type(id);
+ if(texture_type == IMAGE_DATA_TYPE_BYTE4 ||
+ texture_type == IMAGE_DATA_TYPE_BYTE)
+ {
+ r = min(r, make_float4(1.0f, 1.0f, 1.0f, 1.0f));
}
+ r.w = alpha;
}
if(srgb) {
- r.x = color_srgb_to_scene_linear(r.x);
- r.y = color_srgb_to_scene_linear(r.y);
- r.z = color_srgb_to_scene_linear(r.z);
+ r = color_srgb_to_scene_linear_v4(r);
}
-#endif
return r;
}
@@ -336,8 +304,8 @@ ccl_device void svm_node_tex_environment(KernelGlobals *kg, ShaderData *sd, floa
float3 co = stack_load_float3(stack, co_offset);
float2 uv;
- co = normalize(co);
-
+ co = safe_normalize(co);
+
if(projection == 0)
uv = direction_to_equirectangular(co);
else
diff --git a/intern/cycles/kernel/svm/svm_types.h b/intern/cycles/kernel/svm/svm_types.h
index 47209ddfbab..d859cae1708 100644
--- a/intern/cycles/kernel/svm/svm_types.h
+++ b/intern/cycles/kernel/svm/svm_types.h
@@ -397,17 +397,23 @@ typedef enum ClosureType {
CLOSURE_BSDF_DIFFUSE_ID,
CLOSURE_BSDF_OREN_NAYAR_ID,
CLOSURE_BSDF_DIFFUSE_RAMP_ID,
+ CLOSURE_BSDF_PRINCIPLED_DIFFUSE_ID,
+ CLOSURE_BSDF_PRINCIPLED_SHEEN_ID,
CLOSURE_BSDF_DIFFUSE_TOON_ID,
/* Glossy */
- CLOSURE_BSDF_GLOSSY_ID,
CLOSURE_BSDF_REFLECTION_ID,
CLOSURE_BSDF_MICROFACET_GGX_ID,
+ CLOSURE_BSDF_MICROFACET_GGX_FRESNEL_ID,
+ CLOSURE_BSDF_MICROFACET_GGX_CLEARCOAT_ID,
CLOSURE_BSDF_MICROFACET_BECKMANN_ID,
CLOSURE_BSDF_MICROFACET_MULTI_GGX_ID,
+ CLOSURE_BSDF_MICROFACET_MULTI_GGX_FRESNEL_ID,
CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ID,
CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID,
+ CLOSURE_BSDF_MICROFACET_GGX_ANISO_FRESNEL_ID,
CLOSURE_BSDF_MICROFACET_MULTI_GGX_ANISO_ID,
+ CLOSURE_BSDF_MICROFACET_MULTI_GGX_ANISO_FRESNEL_ID,
CLOSURE_BSDF_MICROFACET_BECKMANN_ANISO_ID,
CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ANISO_ID,
CLOSURE_BSDF_ASHIKHMIN_VELVET_ID,
@@ -416,24 +422,26 @@ typedef enum ClosureType {
CLOSURE_BSDF_HAIR_REFLECTION_ID,
/* Transmission */
- CLOSURE_BSDF_TRANSMISSION_ID,
CLOSURE_BSDF_TRANSLUCENT_ID,
CLOSURE_BSDF_REFRACTION_ID,
CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID,
CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID,
+ CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID,
CLOSURE_BSDF_MICROFACET_BECKMANN_GLASS_ID,
CLOSURE_BSDF_MICROFACET_GGX_GLASS_ID,
- CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID,
+ CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_FRESNEL_ID,
CLOSURE_BSDF_SHARP_GLASS_ID,
CLOSURE_BSDF_HAIR_TRANSMISSION_ID,
/* Special cases */
CLOSURE_BSDF_BSSRDF_ID,
+ CLOSURE_BSDF_BSSRDF_PRINCIPLED_ID,
CLOSURE_BSDF_TRANSPARENT_ID,
/* BSSRDF */
CLOSURE_BSSRDF_CUBIC_ID,
CLOSURE_BSSRDF_GAUSSIAN_ID,
+ CLOSURE_BSSRDF_PRINCIPLED_ID,
CLOSURE_BSSRDF_BURLEY_ID,
/* Other */
@@ -447,19 +455,24 @@ typedef enum ClosureType {
CLOSURE_VOLUME_ABSORPTION_ID,
CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID,
+ CLOSURE_BSDF_PRINCIPLED_ID,
+
NBUILTIN_CLOSURES
} ClosureType;
/* watch this, being lazy with memory usage */
#define CLOSURE_IS_BSDF(type) (type <= CLOSURE_BSDF_TRANSPARENT_ID)
#define CLOSURE_IS_BSDF_DIFFUSE(type) (type >= CLOSURE_BSDF_DIFFUSE_ID && type <= CLOSURE_BSDF_DIFFUSE_TOON_ID)
-#define CLOSURE_IS_BSDF_GLOSSY(type) (type >= CLOSURE_BSDF_GLOSSY_ID && type <= CLOSURE_BSDF_HAIR_REFLECTION_ID)
-#define CLOSURE_IS_BSDF_TRANSMISSION(type) (type >= CLOSURE_BSDF_TRANSMISSION_ID && type <= CLOSURE_BSDF_HAIR_TRANSMISSION_ID)
-#define CLOSURE_IS_BSDF_BSSRDF(type) (type == CLOSURE_BSDF_BSSRDF_ID)
+#define CLOSURE_IS_BSDF_GLOSSY(type) (type >= CLOSURE_BSDF_REFLECTION_ID && type <= CLOSURE_BSDF_HAIR_REFLECTION_ID)
+#define CLOSURE_IS_BSDF_TRANSMISSION(type) (type >= CLOSURE_BSDF_TRANSLUCENT_ID && type <= CLOSURE_BSDF_HAIR_TRANSMISSION_ID)
+#define CLOSURE_IS_BSDF_BSSRDF(type) (type == CLOSURE_BSDF_BSSRDF_ID || type == CLOSURE_BSDF_BSSRDF_PRINCIPLED_ID)
+#define CLOSURE_IS_BSDF_TRANSPARENT(type) (type == CLOSURE_BSDF_TRANSPARENT_ID)
#define CLOSURE_IS_BSDF_ANISOTROPIC(type) (type >= CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID && type <= CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ANISO_ID)
#define CLOSURE_IS_BSDF_MULTISCATTER(type) (type == CLOSURE_BSDF_MICROFACET_MULTI_GGX_ID ||\
type == CLOSURE_BSDF_MICROFACET_MULTI_GGX_ANISO_ID || \
- type == CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID)
+ type == CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID)
+#define CLOSURE_IS_BSDF_MICROFACET(type) ((type >= CLOSURE_BSDF_REFLECTION_ID && type <= CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ANISO_ID) ||\
+ (type >= CLOSURE_BSDF_REFRACTION_ID && type <= CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID))
#define CLOSURE_IS_BSDF_OR_BSSRDF(type) (type <= CLOSURE_BSSRDF_BURLEY_ID)
#define CLOSURE_IS_BSSRDF(type) (type >= CLOSURE_BSSRDF_CUBIC_ID && type <= CLOSURE_BSSRDF_BURLEY_ID)
#define CLOSURE_IS_VOLUME(type) (type >= CLOSURE_VOLUME_ID && type <= CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID)
@@ -468,7 +481,8 @@ typedef enum ClosureType {
#define CLOSURE_IS_BACKGROUND(type) (type == CLOSURE_BACKGROUND_ID)
#define CLOSURE_IS_AMBIENT_OCCLUSION(type) (type == CLOSURE_AMBIENT_OCCLUSION_ID)
#define CLOSURE_IS_PHASE(type) (type == CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID)
-#define CLOSURE_IS_GLASS(type) (type >= CLOSURE_BSDF_MICROFACET_BECKMANN_GLASS_ID && type <= CLOSURE_BSDF_SHARP_GLASS_ID)
+#define CLOSURE_IS_GLASS(type) (type >= CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID && type <= CLOSURE_BSDF_SHARP_GLASS_ID)
+#define CLOSURE_IS_PRINCIPLED(type) (type == CLOSURE_BSDF_PRINCIPLED_ID)
#define CLOSURE_WEIGHT_CUTOFF 1e-5f
diff --git a/intern/cycles/kernel/svm/svm_voxel.h b/intern/cycles/kernel/svm/svm_voxel.h
index 9e826c8c23f..f4a5b2b2994 100644
--- a/intern/cycles/kernel/svm/svm_voxel.h
+++ b/intern/cycles/kernel/svm/svm_voxel.h
@@ -46,8 +46,13 @@ ccl_device void svm_node_tex_voxel(KernelGlobals *kg,
# if defined(__KERNEL_CUDA__)
# if __CUDA_ARCH__ >= 300
CUtexObject tex = kernel_tex_fetch(__bindless_mapping, id);
- if(id < TEX_START_HALF4_CUDA_KEPLER)
+ const int texture_type = kernel_tex_type(id);
+ if(texture_type == IMAGE_DATA_TYPE_FLOAT4 ||
+ texture_type == IMAGE_DATA_TYPE_BYTE4 ||
+ texture_type == IMAGE_DATA_TYPE_HALF4)
+ {
r = kernel_tex_image_interp_3d_float4(tex, co.x, co.y, co.z);
+ }
else {
float f = kernel_tex_image_interp_3d_float(tex, co.x, co.y, co.z);
r = make_float4(f, f, f, 1.0f);
diff --git a/intern/cycles/render/buffers.cpp b/intern/cycles/render/buffers.cpp
index fe2c2e78926..cf402c3f214 100644
--- a/intern/cycles/render/buffers.cpp
+++ b/intern/cycles/render/buffers.cpp
@@ -42,6 +42,9 @@ BufferParams::BufferParams()
full_width = 0;
full_height = 0;
+ denoising_data_pass = false;
+ denoising_clean_pass = false;
+
Pass::add(PASS_COMBINED, passes);
}
@@ -68,10 +71,25 @@ int BufferParams::get_passes_size()
for(size_t i = 0; i < passes.size(); i++)
size += passes[i].components;
-
+
+ if(denoising_data_pass) {
+ size += DENOISING_PASS_SIZE_BASE;
+ if(denoising_clean_pass) size += DENOISING_PASS_SIZE_CLEAN;
+ }
+
return align_up(size, 4);
}
+int BufferParams::get_denoising_offset()
+{
+ int offset = 0;
+
+ for(size_t i = 0; i < passes.size(); i++)
+ offset += passes[i].components;
+
+ return offset;
+}
+
/* Render Buffer Task */
RenderTile::RenderTile()
@@ -138,12 +156,51 @@ void RenderBuffers::reset(Device *device, BufferParams& params_)
device->mem_alloc("rng_state", rng_state, MEM_READ_WRITE);
}
-bool RenderBuffers::copy_from_device()
+bool RenderBuffers::copy_from_device(Device *from_device)
{
if(!buffer.device_pointer)
return false;
- device->mem_copy_from(buffer, 0, params.width, params.height, params.get_passes_size()*sizeof(float));
+ if(!from_device) {
+ from_device = device;
+ }
+
+ from_device->mem_copy_from(buffer, 0, params.width, params.height, params.get_passes_size()*sizeof(float));
+
+ return true;
+}
+
+bool RenderBuffers::get_denoising_pass_rect(int offset, float exposure, int sample, int components, float *pixels)
+{
+ float scale = 1.0f/sample;
+
+ if(offset == DENOISING_PASS_COLOR) {
+ scale *= exposure;
+ }
+ else if(offset == DENOISING_PASS_COLOR_VAR) {
+ scale *= exposure*exposure;
+ }
+
+ offset += params.get_denoising_offset();
+ float *in = (float*)buffer.data_pointer + offset;
+ int pass_stride = params.get_passes_size();
+ int size = params.width*params.height;
+
+ if(components == 1) {
+ for(int i = 0; i < size; i++, in += pass_stride, pixels++) {
+ pixels[0] = in[0]*scale;
+ }
+ }
+ else if(components == 3) {
+ for(int i = 0; i < size; i++, in += pass_stride, pixels += 3) {
+ pixels[0] = in[0]*scale;
+ pixels[1] = in[1]*scale;
+ pixels[2] = in[2]*scale;
+ }
+ }
+ else {
+ return false;
+ }
return true;
}
diff --git a/intern/cycles/render/buffers.h b/intern/cycles/render/buffers.h
index 5c78971678a..e56556c8abe 100644
--- a/intern/cycles/render/buffers.h
+++ b/intern/cycles/render/buffers.h
@@ -51,6 +51,9 @@ public:
/* passes */
array<Pass> passes;
+ bool denoising_data_pass;
+ /* If only some light path types should be denoised, an additional pass is needed. */
+ bool denoising_clean_pass;
/* functions */
BufferParams();
@@ -59,6 +62,7 @@ public:
bool modified(const BufferParams& params);
void add_pass(PassType type);
int get_passes_size();
+ int get_denoising_offset();
};
/* Render Buffers */
@@ -73,18 +77,19 @@ public:
/* random number generator state */
device_vector<uint> rng_state;
+ Device *device;
+
explicit RenderBuffers(Device *device);
~RenderBuffers();
void reset(Device *device, BufferParams& params);
- bool copy_from_device();
+ bool copy_from_device(Device *from_device = NULL);
bool get_pass_rect(PassType type, float exposure, int sample, int components, float *pixels);
+ bool get_denoising_pass_rect(int offset, float exposure, int sample, int components, float *pixels);
protected:
void device_free();
-
- Device *device;
};
/* Display Buffer
@@ -131,6 +136,9 @@ protected:
class RenderTile {
public:
+ typedef enum { PATH_TRACE, DENOISE } Task;
+
+ Task task;
int x, y, w, h;
int start_sample;
int num_samples;
@@ -138,6 +146,7 @@ public:
int resolution;
int offset;
int stride;
+ int tile_index;
device_ptr buffer;
device_ptr rng_state;
diff --git a/intern/cycles/render/constant_fold.cpp b/intern/cycles/render/constant_fold.cpp
index 2569d9eec27..943b218f0e4 100644
--- a/intern/cycles/render/constant_fold.cpp
+++ b/intern/cycles/render/constant_fold.cpp
@@ -160,6 +160,14 @@ bool ConstantFolder::try_bypass_or_make_constant(ShaderInput *input, bool clamp)
bypass(input->link);
return true;
}
+ else {
+ /* disconnect other inputs if we can't fully bypass due to clamp */
+ foreach(ShaderInput *other, node->inputs) {
+ if(other != input && other->link) {
+ graph->disconnect(other);
+ }
+ }
+ }
return false;
}
diff --git a/intern/cycles/render/film.cpp b/intern/cycles/render/film.cpp
index 7809f4345f1..c8213d258d5 100644
--- a/intern/cycles/render/film.cpp
+++ b/intern/cycles/render/film.cpp
@@ -279,6 +279,10 @@ NODE_DEFINE(Film)
SOCKET_BOOLEAN(use_sample_clamp, "Use Sample Clamp", false);
+ SOCKET_BOOLEAN(denoising_data_pass, "Generate Denoising Data Pass", false);
+ SOCKET_BOOLEAN(denoising_clean_pass, "Generate Denoising Clean Pass", false);
+ SOCKET_INT(denoising_flags, "Denoising Flags", 0);
+
return type;
}
@@ -437,6 +441,20 @@ void Film::device_update(Device *device, DeviceScene *dscene, Scene *scene)
kfilm->pass_stride += pass.components;
}
+ kfilm->pass_denoising_data = 0;
+ kfilm->pass_denoising_clean = 0;
+ kfilm->denoising_flags = 0;
+ if(denoising_data_pass) {
+ kfilm->pass_denoising_data = kfilm->pass_stride;
+ kfilm->pass_stride += DENOISING_PASS_SIZE_BASE;
+ kfilm->denoising_flags = denoising_flags;
+ if(denoising_clean_pass) {
+ kfilm->pass_denoising_clean = kfilm->pass_stride;
+ kfilm->pass_stride += DENOISING_PASS_SIZE_CLEAN;
+ kfilm->use_light_pass = 1;
+ }
+ }
+
kfilm->pass_stride = align_up(kfilm->pass_stride, 4);
kfilm->pass_alpha_threshold = pass_alpha_threshold;
@@ -451,6 +469,10 @@ void Film::device_update(Device *device, DeviceScene *dscene, Scene *scene)
kfilm->mist_inv_depth = (mist_depth > 0.0f)? 1.0f/mist_depth: 0.0f;
kfilm->mist_falloff = mist_falloff;
+ pass_stride = kfilm->pass_stride;
+ denoising_data_offset = kfilm->pass_denoising_data;
+ denoising_clean_offset = kfilm->pass_denoising_clean;
+
need_update = false;
}
diff --git a/intern/cycles/render/film.h b/intern/cycles/render/film.h
index 83c941d5c57..29b1e7e9157 100644
--- a/intern/cycles/render/film.h
+++ b/intern/cycles/render/film.h
@@ -57,8 +57,15 @@ public:
float exposure;
array<Pass> passes;
+ bool denoising_data_pass;
+ bool denoising_clean_pass;
+ int denoising_flags;
float pass_alpha_threshold;
+ int pass_stride;
+ int denoising_data_offset;
+ int denoising_clean_offset;
+
FilterType filter_type;
float filter_width;
size_t filter_table_offset;
diff --git a/intern/cycles/render/graph.cpp b/intern/cycles/render/graph.cpp
index 12fff8e5587..2d810ff664f 100644
--- a/intern/cycles/render/graph.cpp
+++ b/intern/cycles/render/graph.cpp
@@ -423,7 +423,8 @@ void ShaderGraph::copy_nodes(ShaderNodeSet& nodes, ShaderNodeMap& nnodemap)
/* Graph simplification */
/* ******************** */
-/* Step 1: Remove proxy nodes.
+/* Remove proxy nodes.
+ *
* These only exists temporarily when exporting groups, and we must remove them
* early so that node->attributes() and default links do not see them.
*/
@@ -493,7 +494,8 @@ void ShaderGraph::remove_proxy_nodes()
}
}
-/* Step 2: Constant folding.
+/* Constant folding.
+ *
* Try to constant fold some nodes, and pipe result directly to
* the input socket of connected nodes.
*/
@@ -554,7 +556,7 @@ void ShaderGraph::constant_fold()
}
}
-/* Step 3: Simplification. */
+/* Simplification. */
void ShaderGraph::simplify_settings(Scene *scene)
{
foreach(ShaderNode *node, nodes) {
@@ -562,7 +564,7 @@ void ShaderGraph::simplify_settings(Scene *scene)
}
}
-/* Step 4: Deduplicate nodes with same settings. */
+/* Deduplicate nodes with same settings. */
void ShaderGraph::deduplicate_nodes()
{
/* NOTES:
@@ -638,6 +640,48 @@ void ShaderGraph::deduplicate_nodes()
}
}
+/* Check whether volume output has meaningful nodes, otherwise
+ * disconnect the output.
+ */
+void ShaderGraph::verify_volume_output()
+{
+ /* Check whether we can optimize the whole volume graph out. */
+ ShaderInput *volume_in = output()->input("Volume");
+ if(volume_in->link == NULL) {
+ return;
+ }
+ bool has_valid_volume = false;
+ ShaderNodeSet scheduled;
+ queue<ShaderNode*> traverse_queue;
+ /* Schedule volume output. */
+ traverse_queue.push(volume_in->link->parent);
+ scheduled.insert(volume_in->link->parent);
+ /* Traverse down the tree. */
+ while(!traverse_queue.empty()) {
+ ShaderNode *node = traverse_queue.front();
+ traverse_queue.pop();
+ /* Node is fully valid for volume, can't optimize anything out. */
+ if(node->has_volume_support()) {
+ has_valid_volume = true;
+ break;
+ }
+ foreach(ShaderInput *input, node->inputs) {
+ if(input->link == NULL) {
+ continue;
+ }
+ if(scheduled.find(input->link->parent) != scheduled.end()) {
+ continue;
+ }
+ traverse_queue.push(input->link->parent);
+ scheduled.insert(input->link->parent);
+ }
+ }
+ if(!has_valid_volume) {
+ VLOG(1) << "Disconnect meaningless volume output.";
+ disconnect(volume_in->link);
+ }
+}
+
void ShaderGraph::break_cycles(ShaderNode *node, vector<bool>& visited, vector<bool>& on_stack)
{
visited[node->id] = true;
@@ -666,16 +710,11 @@ void ShaderGraph::clean(Scene *scene)
{
/* Graph simplification */
- /* 1: Remove proxy nodes was already done. */
-
- /* 2: Constant folding. */
+ /* NOTE: Remove proxy nodes was already done. */
constant_fold();
-
- /* 3: Simplification. */
simplify_settings(scene);
-
- /* 4: De-duplication. */
deduplicate_nodes();
+ verify_volume_output();
/* we do two things here: find cycles and break them, and remove unused
* nodes that don't feed into the output. how cycles are broken is
@@ -998,6 +1037,9 @@ int ShaderGraph::get_num_closures()
else if(CLOSURE_IS_BSDF_MULTISCATTER(closure_type)) {
num_closures += 2;
}
+ else if(CLOSURE_IS_PRINCIPLED(closure_type)) {
+ num_closures += 8;
+ }
else {
++num_closures;
}
diff --git a/intern/cycles/render/graph.h b/intern/cycles/render/graph.h
index 09932695d1f..72e391991a7 100644
--- a/intern/cycles/render/graph.h
+++ b/intern/cycles/render/graph.h
@@ -155,7 +155,7 @@ public:
virtual bool has_spatial_varying() { return false; }
virtual bool has_object_dependency() { return false; }
virtual bool has_integrator_dependency() { return false; }
-
+ virtual bool has_volume_support() { return false; }
vector<ShaderInput*> inputs;
vector<ShaderOutput*> outputs;
@@ -284,6 +284,7 @@ protected:
void constant_fold();
void simplify_settings(Scene *scene);
void deduplicate_nodes();
+ void verify_volume_output();
};
CCL_NAMESPACE_END
diff --git a/intern/cycles/render/image.cpp b/intern/cycles/render/image.cpp
index a8c4f446bea..02b65440154 100644
--- a/intern/cycles/render/image.cpp
+++ b/intern/cycles/render/image.cpp
@@ -30,6 +30,16 @@
CCL_NAMESPACE_BEGIN
+/* Some helpers to silence warning in templated function. */
+static bool isfinite(uchar /*value*/)
+{
+ return false;
+}
+static bool isfinite(half /*value*/)
+{
+ return false;
+}
+
ImageManager::ImageManager(const DeviceInfo& info)
{
need_update = true;
@@ -49,54 +59,24 @@ ImageManager::ImageManager(const DeviceInfo& info)
}
/* Set image limits */
-#define SET_TEX_IMAGES_LIMITS(ARCH) \
- { \
- tex_num_images[IMAGE_DATA_TYPE_FLOAT4] = TEX_NUM_FLOAT4_ ## ARCH; \
- tex_num_images[IMAGE_DATA_TYPE_BYTE4] = TEX_NUM_BYTE4_ ## ARCH; \
- tex_num_images[IMAGE_DATA_TYPE_HALF4] = TEX_NUM_HALF4_ ## ARCH; \
- tex_num_images[IMAGE_DATA_TYPE_FLOAT] = TEX_NUM_FLOAT_ ## ARCH; \
- tex_num_images[IMAGE_DATA_TYPE_BYTE] = TEX_NUM_BYTE_ ## ARCH; \
- tex_num_images[IMAGE_DATA_TYPE_HALF] = TEX_NUM_HALF_ ## ARCH; \
- tex_start_images[IMAGE_DATA_TYPE_FLOAT4] = TEX_START_FLOAT4_ ## ARCH; \
- tex_start_images[IMAGE_DATA_TYPE_BYTE4] = TEX_START_BYTE4_ ## ARCH; \
- tex_start_images[IMAGE_DATA_TYPE_HALF4] = TEX_START_HALF4_ ## ARCH; \
- tex_start_images[IMAGE_DATA_TYPE_FLOAT] = TEX_START_FLOAT_ ## ARCH; \
- tex_start_images[IMAGE_DATA_TYPE_BYTE] = TEX_START_BYTE_ ## ARCH; \
- tex_start_images[IMAGE_DATA_TYPE_HALF] = TEX_START_HALF_ ## ARCH; \
- }
-
- if(device_type == DEVICE_CPU) {
- SET_TEX_IMAGES_LIMITS(CPU);
- }
- else if(device_type == DEVICE_CUDA) {
- if(info.has_bindless_textures) {
- SET_TEX_IMAGES_LIMITS(CUDA_KEPLER);
- }
- else {
- SET_TEX_IMAGES_LIMITS(CUDA);
+ max_num_images = TEX_NUM_MAX;
+ has_half_images = true;
+ cuda_fermi_limits = false;
+
+ if(device_type == DEVICE_CUDA) {
+ if(!info.has_bindless_textures) {
+ /* CUDA Fermi hardware (SM 2.x) has a hard limit on the number of textures */
+ cuda_fermi_limits = true;
+ has_half_images = false;
}
}
else if(device_type == DEVICE_OPENCL) {
- SET_TEX_IMAGES_LIMITS(OPENCL);
- }
- else {
- /* Should not happen. */
- tex_num_images[IMAGE_DATA_TYPE_FLOAT4] = 0;
- tex_num_images[IMAGE_DATA_TYPE_BYTE4] = 0;
- tex_num_images[IMAGE_DATA_TYPE_HALF4] = 0;
- tex_num_images[IMAGE_DATA_TYPE_FLOAT] = 0;
- tex_num_images[IMAGE_DATA_TYPE_BYTE] = 0;
- tex_num_images[IMAGE_DATA_TYPE_HALF] = 0;
- tex_start_images[IMAGE_DATA_TYPE_FLOAT4] = 0;
- tex_start_images[IMAGE_DATA_TYPE_BYTE4] = 0;
- tex_start_images[IMAGE_DATA_TYPE_HALF4] = 0;
- tex_start_images[IMAGE_DATA_TYPE_FLOAT] = 0;
- tex_start_images[IMAGE_DATA_TYPE_BYTE] = 0;
- tex_start_images[IMAGE_DATA_TYPE_HALF] = 0;
- assert(0);
+ has_half_images = false;
}
-#undef SET_TEX_IMAGES_LIMITS
+ for(size_t type = 0; type < IMAGE_DATA_NUM_TYPES; type++) {
+ tex_num_images[type] = 0;
+ }
}
ImageManager::~ImageManager()
@@ -133,18 +113,20 @@ bool ImageManager::set_animation_frame_update(int frame)
return false;
}
-ImageManager::ImageDataType ImageManager::get_image_metadata(const string& filename,
- void *builtin_data,
- bool& is_linear)
+ImageDataType ImageManager::get_image_metadata(const string& filename,
+ void *builtin_data,
+ bool& is_linear,
+ bool& builtin_free_cache)
{
bool is_float = false, is_half = false;
is_linear = false;
+ builtin_free_cache = false;
int channels = 4;
if(builtin_data) {
if(builtin_image_info_cb) {
int width, height, depth;
- builtin_image_info_cb(filename, builtin_data, is_float, width, height, depth, channels);
+ builtin_image_info_cb(filename, builtin_data, is_float, width, height, depth, channels, builtin_free_cache);
}
if(is_float) {
@@ -226,26 +208,28 @@ ImageManager::ImageDataType ImageManager::get_image_metadata(const string& filen
}
}
-/* We use a consecutive slot counting scheme on the devices, in order
- * float4, byte4, half4, float, byte, half.
+int ImageManager::max_flattened_slot(ImageDataType type)
+{
+ if(tex_num_images[type] == 0) {
+ /* No textures for the type, no slots needs allocation. */
+ return 0;
+ }
+ return type_index_to_flattened_slot(tex_num_images[type], type);
+}
+
+/* The lower three bits of a device texture slot number indicate its type.
* These functions convert the slot ids from ImageManager "images" ones
- * to device ones and vice versa. */
+ * to device ones and vice verse.
+ */
int ImageManager::type_index_to_flattened_slot(int slot, ImageDataType type)
{
- return slot + tex_start_images[type];
+ return (slot << IMAGE_DATA_TYPE_SHIFT) | (type);
}
int ImageManager::flattened_slot_to_type_index(int flat_slot, ImageDataType *type)
{
- for(int i = IMAGE_DATA_NUM_TYPES - 1; i >= 0; i--) {
- if(flat_slot >= tex_start_images[i]) {
- *type = (ImageDataType)i;
- return flat_slot - tex_start_images[i];
- }
- }
-
- /* Should not happen. */
- return flat_slot;
+ *type = (ImageDataType)(flat_slot & IMAGE_DATA_TYPE_MASK);
+ return flat_slot >> IMAGE_DATA_TYPE_SHIFT;
}
string ImageManager::name_from_type(int type)
@@ -290,8 +274,9 @@ int ImageManager::add_image(const string& filename,
{
Image *img;
size_t slot;
+ bool builtin_free_cache;
- ImageDataType type = get_image_metadata(filename, builtin_data, is_linear);
+ ImageDataType type = get_image_metadata(filename, builtin_data, is_linear, builtin_free_cache);
thread_scoped_lock device_lock(device_mutex);
@@ -299,14 +284,22 @@ int ImageManager::add_image(const string& filename,
is_float = (type == IMAGE_DATA_TYPE_FLOAT || type == IMAGE_DATA_TYPE_FLOAT4);
/* No single channel and half textures on CUDA (Fermi) and no half on OpenCL, use available slots */
- if((type == IMAGE_DATA_TYPE_FLOAT ||
- type == IMAGE_DATA_TYPE_HALF4 ||
- type == IMAGE_DATA_TYPE_HALF) &&
- tex_num_images[type] == 0) {
- type = IMAGE_DATA_TYPE_FLOAT4;
+ if(!has_half_images) {
+ if(type == IMAGE_DATA_TYPE_HALF4) {
+ type = IMAGE_DATA_TYPE_FLOAT4;
+ }
+ else if(type == IMAGE_DATA_TYPE_HALF) {
+ type = IMAGE_DATA_TYPE_FLOAT;
+ }
}
- if(type == IMAGE_DATA_TYPE_BYTE && tex_num_images[type] == 0) {
- type = IMAGE_DATA_TYPE_BYTE4;
+
+ if(cuda_fermi_limits) {
+ if(type == IMAGE_DATA_TYPE_FLOAT) {
+ type = IMAGE_DATA_TYPE_FLOAT4;
+ }
+ else if(type == IMAGE_DATA_TYPE_BYTE) {
+ type = IMAGE_DATA_TYPE_BYTE4;
+ }
}
/* Fnd existing image. */
@@ -338,14 +331,30 @@ int ImageManager::add_image(const string& filename,
break;
}
- if(slot == images[type].size()) {
- /* Max images limit reached. */
- if(images[type].size() == tex_num_images[type]) {
+ /* Count if we're over the limit */
+ if(cuda_fermi_limits) {
+ if(tex_num_images[IMAGE_DATA_TYPE_BYTE4] == TEX_NUM_BYTE4_CUDA
+ || tex_num_images[IMAGE_DATA_TYPE_FLOAT4] == TEX_NUM_FLOAT4_CUDA)
+ {
printf("ImageManager::add_image: Reached %s image limit (%d), skipping '%s'\n",
- name_from_type(type).c_str(), tex_num_images[type], filename.c_str());
+ name_from_type(type).c_str(), tex_num_images[type], filename.c_str());
return -1;
}
+ }
+ else {
+ /* Very unlikely, since max_num_images is insanely big. But better safe than sorry. */
+ int tex_count = 0;
+ for (int type = 0; type < IMAGE_DATA_NUM_TYPES; type++) {
+ tex_count += tex_num_images[type];
+ }
+ if(tex_count > max_num_images) {
+ printf("ImageManager::add_image: Reached image limit (%d), skipping '%s'\n",
+ max_num_images, filename.c_str());
+ return -1;
+ }
+ }
+ if(slot == images[type].size()) {
images[type].resize(images[type].size() + 1);
}
@@ -353,6 +362,7 @@ int ImageManager::add_image(const string& filename,
img = new Image();
img->filename = filename;
img->builtin_data = builtin_data;
+ img->builtin_free_cache = builtin_free_cache;
img->need_load = true;
img->animated = animated;
img->frame = frame;
@@ -363,6 +373,8 @@ int ImageManager::add_image(const string& filename,
images[type][slot] = img;
+ ++tex_num_images[type];
+
need_update = true;
return type_index_to_flattened_slot(slot, type);
@@ -436,7 +448,12 @@ void ImageManager::tag_reload_image(const string& filename,
}
}
-bool ImageManager::file_load_image_generic(Image *img, ImageInput **in, int &width, int &height, int &depth, int &components)
+bool ImageManager::file_load_image_generic(Image *img,
+ ImageInput **in,
+ int &width,
+ int &height,
+ int &depth,
+ int &components)
{
if(img->filename == "")
return false;
@@ -475,8 +492,8 @@ bool ImageManager::file_load_image_generic(Image *img, ImageInput **in, int &wid
if(!builtin_image_info_cb || !builtin_image_pixels_cb)
return false;
- bool is_float;
- builtin_image_info_cb(img->filename, img->builtin_data, is_float, width, height, depth, components);
+ bool is_float, free_cache;
+ builtin_image_info_cb(img->filename, img->builtin_data, is_float, width, height, depth, components, free_cache);
}
/* we only handle certain number of components */
@@ -557,13 +574,15 @@ bool ImageManager::file_load_image(Image *img,
builtin_image_float_pixels_cb(img->filename,
img->builtin_data,
(float*)&pixels[0],
- num_pixels * components);
+ num_pixels * components,
+ img->builtin_free_cache);
}
else if(FileFormat == TypeDesc::UINT8) {
builtin_image_pixels_cb(img->filename,
img->builtin_data,
(uchar*)&pixels[0],
- num_pixels * components);
+ num_pixels * components,
+ img->builtin_free_cache);
}
else {
/* TODO(dingto): Support half for ImBuf. */
@@ -618,6 +637,37 @@ bool ImageManager::file_load_image(Image *img,
}
}
}
+ /* Make sure we don't have buggy values. */
+ if(FileFormat == TypeDesc::FLOAT) {
+ /* For RGBA buffers we put all channels to 0 if either of them is not
+ * finite. This way we avoid possible artifacts caused by fully changed
+ * hue.
+ */
+ if(is_rgba) {
+ for(size_t i = 0; i < num_pixels; i += 4) {
+ StorageType *pixel = &pixels[i*4];
+ if(!isfinite(pixel[0]) ||
+ !isfinite(pixel[1]) ||
+ !isfinite(pixel[2]) ||
+ !isfinite(pixel[3]))
+ {
+ pixel[0] = 0;
+ pixel[1] = 0;
+ pixel[2] = 0;
+ pixel[3] = 0;
+ }
+ }
+ }
+ else {
+ for(size_t i = 0; i < num_pixels; ++i) {
+ StorageType *pixel = &pixels[i];
+ if(!isfinite(pixel[0])) {
+ pixel[0] = 0;
+ }
+ }
+ }
+ }
+ /* Scale image down if needed. */
if(pixels_storage.size() > 0) {
float scale_factor = 1.0f;
while(max_size * scale_factor > texture_limit) {
@@ -666,16 +716,12 @@ void ImageManager::device_load_image(Device *device,
/* Slot assignment */
int flat_slot = type_index_to_flattened_slot(slot, type);
- string name;
- if(flat_slot >= 100)
- name = string_printf("__tex_image_%s_%d", name_from_type(type).c_str(), flat_slot);
- else if(flat_slot >= 10)
- name = string_printf("__tex_image_%s_0%d", name_from_type(type).c_str(), flat_slot);
- else
- name = string_printf("__tex_image_%s_00%d", name_from_type(type).c_str(), flat_slot);
+ string name = string_printf("__tex_image_%s_%03d", name_from_type(type).c_str(), flat_slot);
if(type == IMAGE_DATA_TYPE_FLOAT4) {
- device_vector<float4>& tex_img = dscene->tex_float4_image[slot];
+ if(dscene->tex_float4_image[slot] == NULL)
+ dscene->tex_float4_image[slot] = new device_vector<float4>();
+ device_vector<float4>& tex_img = *dscene->tex_float4_image[slot];
if(tex_img.device_pointer) {
thread_scoped_lock device_lock(device_mutex);
@@ -705,7 +751,9 @@ void ImageManager::device_load_image(Device *device,
}
}
else if(type == IMAGE_DATA_TYPE_FLOAT) {
- device_vector<float>& tex_img = dscene->tex_float_image[slot];
+ if(dscene->tex_float_image[slot] == NULL)
+ dscene->tex_float_image[slot] = new device_vector<float>();
+ device_vector<float>& tex_img = *dscene->tex_float_image[slot];
if(tex_img.device_pointer) {
thread_scoped_lock device_lock(device_mutex);
@@ -732,7 +780,9 @@ void ImageManager::device_load_image(Device *device,
}
}
else if(type == IMAGE_DATA_TYPE_BYTE4) {
- device_vector<uchar4>& tex_img = dscene->tex_byte4_image[slot];
+ if(dscene->tex_byte4_image[slot] == NULL)
+ dscene->tex_byte4_image[slot] = new device_vector<uchar4>();
+ device_vector<uchar4>& tex_img = *dscene->tex_byte4_image[slot];
if(tex_img.device_pointer) {
thread_scoped_lock device_lock(device_mutex);
@@ -762,7 +812,9 @@ void ImageManager::device_load_image(Device *device,
}
}
else if(type == IMAGE_DATA_TYPE_BYTE){
- device_vector<uchar>& tex_img = dscene->tex_byte_image[slot];
+ if(dscene->tex_byte_image[slot] == NULL)
+ dscene->tex_byte_image[slot] = new device_vector<uchar>();
+ device_vector<uchar>& tex_img = *dscene->tex_byte_image[slot];
if(tex_img.device_pointer) {
thread_scoped_lock device_lock(device_mutex);
@@ -788,7 +840,9 @@ void ImageManager::device_load_image(Device *device,
}
}
else if(type == IMAGE_DATA_TYPE_HALF4){
- device_vector<half4>& tex_img = dscene->tex_half4_image[slot];
+ if(dscene->tex_half4_image[slot] == NULL)
+ dscene->tex_half4_image[slot] = new device_vector<half4>();
+ device_vector<half4>& tex_img = *dscene->tex_half4_image[slot];
if(tex_img.device_pointer) {
thread_scoped_lock device_lock(device_mutex);
@@ -817,7 +871,9 @@ void ImageManager::device_load_image(Device *device,
}
}
else if(type == IMAGE_DATA_TYPE_HALF){
- device_vector<half>& tex_img = dscene->tex_half_image[slot];
+ if(dscene->tex_half_image[slot] == NULL)
+ dscene->tex_half_image[slot] = new device_vector<half>();
+ device_vector<half>& tex_img = *dscene->tex_half_image[slot];
if(tex_img.device_pointer) {
thread_scoped_lock device_lock(device_mutex);
@@ -857,69 +913,100 @@ void ImageManager::device_free_image(Device *device, DeviceScene *dscene, ImageD
((OSL::TextureSystem*)osl_texture_system)->invalidate(filename);
#endif
}
- else if(type == IMAGE_DATA_TYPE_FLOAT4) {
- device_vector<float4>& tex_img = dscene->tex_float4_image[slot];
-
- if(tex_img.device_pointer) {
- thread_scoped_lock device_lock(device_mutex);
- device->tex_free(tex_img);
- }
-
- tex_img.clear();
- }
- else if(type == IMAGE_DATA_TYPE_FLOAT) {
- device_vector<float>& tex_img = dscene->tex_float_image[slot];
-
- if(tex_img.device_pointer) {
- thread_scoped_lock device_lock(device_mutex);
- device->tex_free(tex_img);
- }
-
- tex_img.clear();
- }
- else if(type == IMAGE_DATA_TYPE_BYTE4) {
- device_vector<uchar4>& tex_img = dscene->tex_byte4_image[slot];
-
- if(tex_img.device_pointer) {
- thread_scoped_lock device_lock(device_mutex);
- device->tex_free(tex_img);
- }
-
- tex_img.clear();
- }
- else if(type == IMAGE_DATA_TYPE_BYTE){
- device_vector<uchar>& tex_img = dscene->tex_byte_image[slot];
-
- if(tex_img.device_pointer) {
- thread_scoped_lock device_lock(device_mutex);
- device->tex_free(tex_img);
- }
-
- tex_img.clear();
- }
- else if(type == IMAGE_DATA_TYPE_HALF4){
- device_vector<half4>& tex_img = dscene->tex_half4_image[slot];
-
- if(tex_img.device_pointer) {
- thread_scoped_lock device_lock(device_mutex);
- device->tex_free(tex_img);
+ else {
+ device_memory *tex_img = NULL;
+ switch(type) {
+ case IMAGE_DATA_TYPE_FLOAT4:
+ if(slot >= dscene->tex_float4_image.size()) {
+ break;
+ }
+ tex_img = dscene->tex_float4_image[slot];
+ dscene->tex_float4_image[slot] = NULL;
+ break;
+ case IMAGE_DATA_TYPE_BYTE4:
+ if(slot >= dscene->tex_byte4_image.size()) {
+ break;
+ }
+ tex_img = dscene->tex_byte4_image[slot];
+ dscene->tex_byte4_image[slot]= NULL;
+ break;
+ case IMAGE_DATA_TYPE_HALF4:
+ if(slot >= dscene->tex_half4_image.size()) {
+ break;
+ }
+ tex_img = dscene->tex_half4_image[slot];
+ dscene->tex_half4_image[slot]= NULL;
+ break;
+ case IMAGE_DATA_TYPE_FLOAT:
+ if(slot >= dscene->tex_float_image.size()) {
+ break;
+ }
+ tex_img = dscene->tex_float_image[slot];
+ dscene->tex_float_image[slot] = NULL;
+ break;
+ case IMAGE_DATA_TYPE_BYTE:
+ if(slot >= dscene->tex_byte_image.size()) {
+ break;
+ }
+ tex_img = dscene->tex_byte_image[slot];
+ dscene->tex_byte_image[slot]= NULL;
+ break;
+ case IMAGE_DATA_TYPE_HALF:
+ if(slot >= dscene->tex_half_image.size()) {
+ break;
+ }
+ tex_img = dscene->tex_half_image[slot];
+ dscene->tex_half_image[slot]= NULL;
+ break;
+ default:
+ assert(0);
+ tex_img = NULL;
}
+ if(tex_img) {
+ if(tex_img->device_pointer) {
+ thread_scoped_lock device_lock(device_mutex);
+ device->tex_free(*tex_img);
+ }
- tex_img.clear();
- }
- else if(type == IMAGE_DATA_TYPE_HALF){
- device_vector<half>& tex_img = dscene->tex_half_image[slot];
-
- if(tex_img.device_pointer) {
- thread_scoped_lock device_lock(device_mutex);
- device->tex_free(tex_img);
+ delete tex_img;
}
-
- tex_img.clear();
}
delete images[type][slot];
images[type][slot] = NULL;
+ --tex_num_images[type];
+ }
+}
+
+void ImageManager::device_prepare_update(DeviceScene *dscene)
+{
+ for(int type = 0; type < IMAGE_DATA_NUM_TYPES; type++) {
+ switch(type) {
+ case IMAGE_DATA_TYPE_FLOAT4:
+ if(dscene->tex_float4_image.size() <= tex_num_images[IMAGE_DATA_TYPE_FLOAT4])
+ dscene->tex_float4_image.resize(tex_num_images[IMAGE_DATA_TYPE_FLOAT4]);
+ break;
+ case IMAGE_DATA_TYPE_BYTE4:
+ if(dscene->tex_byte4_image.size() <= tex_num_images[IMAGE_DATA_TYPE_BYTE4])
+ dscene->tex_byte4_image.resize(tex_num_images[IMAGE_DATA_TYPE_BYTE4]);
+ break;
+ case IMAGE_DATA_TYPE_HALF4:
+ if(dscene->tex_half4_image.size() <= tex_num_images[IMAGE_DATA_TYPE_HALF4])
+ dscene->tex_half4_image.resize(tex_num_images[IMAGE_DATA_TYPE_HALF4]);
+ break;
+ case IMAGE_DATA_TYPE_BYTE:
+ if(dscene->tex_byte_image.size() <= tex_num_images[IMAGE_DATA_TYPE_BYTE])
+ dscene->tex_byte_image.resize(tex_num_images[IMAGE_DATA_TYPE_BYTE]);
+ break;
+ case IMAGE_DATA_TYPE_FLOAT:
+ if(dscene->tex_float_image.size() <= tex_num_images[IMAGE_DATA_TYPE_FLOAT])
+ dscene->tex_float_image.resize(tex_num_images[IMAGE_DATA_TYPE_FLOAT]);
+ break;
+ case IMAGE_DATA_TYPE_HALF:
+ if(dscene->tex_half_image.size() <= tex_num_images[IMAGE_DATA_TYPE_HALF])
+ dscene->tex_half_image.resize(tex_num_images[IMAGE_DATA_TYPE_HALF]);
+ break;
+ }
}
}
@@ -928,11 +1015,14 @@ void ImageManager::device_update(Device *device,
Scene *scene,
Progress& progress)
{
- if(!need_update)
+ if(!need_update) {
return;
+ }
- TaskPool pool;
+ /* Make sure arrays are proper size. */
+ device_prepare_update(dscene);
+ TaskPool pool;
for(int type = 0; type < IMAGE_DATA_NUM_TYPES; type++) {
for(size_t slot = 0; slot < images[type].size(); slot++) {
if(!images[type][slot])
@@ -992,159 +1082,101 @@ void ImageManager::device_update_slot(Device *device,
uint8_t ImageManager::pack_image_options(ImageDataType type, size_t slot)
{
uint8_t options = 0;
-
/* Image Options are packed into one uint:
* bit 0 -> Interpolation
- * bit 1 + 2 + 3-> Extension */
- if(images[type][slot]->interpolation == INTERPOLATION_CLOSEST)
+ * bit 1 + 2 + 3 -> Extension
+ */
+ if(images[type][slot]->interpolation == INTERPOLATION_CLOSEST) {
options |= (1 << 0);
-
- if(images[type][slot]->extension == EXTENSION_REPEAT)
+ }
+ if(images[type][slot]->extension == EXTENSION_REPEAT) {
options |= (1 << 1);
- else if(images[type][slot]->extension == EXTENSION_EXTEND)
+ }
+ else if(images[type][slot]->extension == EXTENSION_EXTEND) {
options |= (1 << 2);
- else /* EXTENSION_CLIP */
+ }
+ else /* EXTENSION_CLIP */ {
options |= (1 << 3);
-
+ }
return options;
}
-void ImageManager::device_pack_images(Device *device,
- DeviceScene *dscene,
- Progress& /*progess*/)
+template<typename T>
+void ImageManager::device_pack_images_type(
+ ImageDataType type,
+ const vector<device_vector<T>*>& cpu_textures,
+ device_vector<T> *device_image,
+ uint4 *info)
{
- /* For OpenCL, we pack all image textures into a single large texture, and
- * do our own interpolation in the kernel. */
size_t size = 0, offset = 0;
- ImageDataType type;
-
- int info_size = tex_num_images[IMAGE_DATA_TYPE_FLOAT4] + tex_num_images[IMAGE_DATA_TYPE_BYTE4]
- + tex_num_images[IMAGE_DATA_TYPE_FLOAT] + tex_num_images[IMAGE_DATA_TYPE_BYTE];
- uint4 *info = dscene->tex_image_packed_info.resize(info_size*2);
-
- /* Byte4 Textures*/
- type = IMAGE_DATA_TYPE_BYTE4;
-
- for(size_t slot = 0; slot < images[type].size(); slot++) {
- if(!images[type][slot])
- continue;
-
- device_vector<uchar4>& tex_img = dscene->tex_byte4_image[slot];
- size += tex_img.size();
- }
-
- uchar4 *pixels_byte4 = dscene->tex_image_byte4_packed.resize(size);
-
+ /* First step is to calculate size of the texture we need. */
for(size_t slot = 0; slot < images[type].size(); slot++) {
- if(!images[type][slot])
- continue;
-
- device_vector<uchar4>& tex_img = dscene->tex_byte4_image[slot];
-
- uint8_t options = pack_image_options(type, slot);
-
- int index = type_index_to_flattened_slot(slot, type) * 2;
- info[index] = make_uint4(tex_img.data_width, tex_img.data_height, offset, options);
- info[index+1] = make_uint4(tex_img.data_depth, 0, 0, 0);
-
- memcpy(pixels_byte4+offset, (void*)tex_img.data_pointer, tex_img.memory_size());
- offset += tex_img.size();
- }
-
- /* Float4 Textures*/
- type = IMAGE_DATA_TYPE_FLOAT4;
- size = 0, offset = 0;
-
- for(size_t slot = 0; slot < images[type].size(); slot++) {
- if(!images[type][slot])
- continue;
-
- device_vector<float4>& tex_img = dscene->tex_float4_image[slot];
- size += tex_img.size();
- }
-
- float4 *pixels_float4 = dscene->tex_image_float4_packed.resize(size);
-
- for(size_t slot = 0; slot < images[type].size(); slot++) {
- if(!images[type][slot])
- continue;
-
- device_vector<float4>& tex_img = dscene->tex_float4_image[slot];
-
- /* todo: support 3D textures, only CPU for now */
-
- uint8_t options = pack_image_options(type, slot);
-
- int index = type_index_to_flattened_slot(slot, type) * 2;
- info[index] = make_uint4(tex_img.data_width, tex_img.data_height, offset, options);
- info[index+1] = make_uint4(tex_img.data_depth, 0, 0, 0);
-
- memcpy(pixels_float4+offset, (void*)tex_img.data_pointer, tex_img.memory_size());
- offset += tex_img.size();
- }
-
- /* Byte Textures*/
- type = IMAGE_DATA_TYPE_BYTE;
- size = 0, offset = 0;
-
- for(size_t slot = 0; slot < images[type].size(); slot++) {
- if(!images[type][slot])
+ if(images[type][slot] == NULL) {
continue;
-
- device_vector<uchar>& tex_img = dscene->tex_byte_image[slot];
+ }
+ device_vector<T>& tex_img = *cpu_textures[slot];
size += tex_img.size();
}
-
- uchar *pixels_byte = dscene->tex_image_byte_packed.resize(size);
-
+ /* Now we know how much memory we need, so we can allocate and fill. */
+ T *pixels = device_image->resize(size);
for(size_t slot = 0; slot < images[type].size(); slot++) {
- if(!images[type][slot])
+ if(images[type][slot] == NULL) {
continue;
-
- device_vector<uchar>& tex_img = dscene->tex_byte_image[slot];
-
+ }
+ device_vector<T>& tex_img = *cpu_textures[slot];
uint8_t options = pack_image_options(type, slot);
-
- int index = type_index_to_flattened_slot(slot, type) * 2;
- info[index] = make_uint4(tex_img.data_width, tex_img.data_height, offset, options);
+ const int index = type_index_to_flattened_slot(slot, type) * 2;
+ info[index] = make_uint4(tex_img.data_width,
+ tex_img.data_height,
+ offset,
+ options);
info[index+1] = make_uint4(tex_img.data_depth, 0, 0, 0);
-
- memcpy(pixels_byte+offset, (void*)tex_img.data_pointer, tex_img.memory_size());
+ memcpy(pixels + offset,
+ (void*)tex_img.data_pointer,
+ tex_img.memory_size());
offset += tex_img.size();
}
+}
- /* Float Textures*/
- type = IMAGE_DATA_TYPE_FLOAT;
- size = 0, offset = 0;
-
- for(size_t slot = 0; slot < images[type].size(); slot++) {
- if(!images[type][slot])
- continue;
-
- device_vector<float>& tex_img = dscene->tex_float_image[slot];
- size += tex_img.size();
- }
-
- float *pixels_float = dscene->tex_image_float_packed.resize(size);
-
- for(size_t slot = 0; slot < images[type].size(); slot++) {
- if(!images[type][slot])
- continue;
-
- device_vector<float>& tex_img = dscene->tex_float_image[slot];
-
- /* todo: support 3D textures, only CPU for now */
-
- uint8_t options = pack_image_options(type, slot);
-
- int index = type_index_to_flattened_slot(slot, type) * 2;
- info[index] = make_uint4(tex_img.data_width, tex_img.data_height, offset, options);
- info[index+1] = make_uint4(tex_img.data_depth, 0, 0, 0);
+void ImageManager::device_pack_images(Device *device,
+ DeviceScene *dscene,
+ Progress& /*progess*/)
+{
+ /* For OpenCL, we pack all image textures into a single large texture, and
+ * do our own interpolation in the kernel.
+ */
- memcpy(pixels_float+offset, (void*)tex_img.data_pointer, tex_img.memory_size());
- offset += tex_img.size();
- }
+ /* TODO(sergey): This will over-allocate a bit, but this is constant memory
+ * so should be fine for a short term.
+ */
+ const size_t info_size = max4(max_flattened_slot(IMAGE_DATA_TYPE_FLOAT4),
+ max_flattened_slot(IMAGE_DATA_TYPE_BYTE4),
+ max_flattened_slot(IMAGE_DATA_TYPE_FLOAT),
+ max_flattened_slot(IMAGE_DATA_TYPE_BYTE));
+ uint4 *info = dscene->tex_image_packed_info.resize(info_size*2);
+ /* Pack byte4 textures. */
+ device_pack_images_type(IMAGE_DATA_TYPE_BYTE4,
+ dscene->tex_byte4_image,
+ &dscene->tex_image_byte4_packed,
+ info);
+ /* Pack float4 textures. */
+ device_pack_images_type(IMAGE_DATA_TYPE_FLOAT4,
+ dscene->tex_float4_image,
+ &dscene->tex_image_float4_packed,
+ info);
+ /* Pack byte textures. */
+ device_pack_images_type(IMAGE_DATA_TYPE_BYTE,
+ dscene->tex_byte_image,
+ &dscene->tex_image_byte_packed,
+ info);
+ /* Pack float textures. */
+ device_pack_images_type(IMAGE_DATA_TYPE_FLOAT,
+ dscene->tex_float_image,
+ &dscene->tex_image_float_packed,
+ info);
+
+ /* Push textures to the device. */
if(dscene->tex_image_byte4_packed.size()) {
if(dscene->tex_image_byte4_packed.device_pointer) {
thread_scoped_lock device_lock(device_mutex);
@@ -1201,16 +1233,23 @@ void ImageManager::device_free(Device *device, DeviceScene *dscene)
images[type].clear();
}
- device->tex_free(dscene->tex_image_byte4_packed);
+ dscene->tex_float4_image.clear();
+ dscene->tex_byte4_image.clear();
+ dscene->tex_half4_image.clear();
+ dscene->tex_float_image.clear();
+ dscene->tex_byte_image.clear();
+ dscene->tex_half_image.clear();
+
device->tex_free(dscene->tex_image_float4_packed);
- device->tex_free(dscene->tex_image_byte_packed);
+ device->tex_free(dscene->tex_image_byte4_packed);
device->tex_free(dscene->tex_image_float_packed);
+ device->tex_free(dscene->tex_image_byte_packed);
device->tex_free(dscene->tex_image_packed_info);
- dscene->tex_image_byte4_packed.clear();
dscene->tex_image_float4_packed.clear();
- dscene->tex_image_byte_packed.clear();
+ dscene->tex_image_byte4_packed.clear();
dscene->tex_image_float_packed.clear();
+ dscene->tex_image_byte_packed.clear();
dscene->tex_image_packed_info.clear();
}
diff --git a/intern/cycles/render/image.h b/intern/cycles/render/image.h
index 996b5a5b65f..db7e28a5e44 100644
--- a/intern/cycles/render/image.h
+++ b/intern/cycles/render/image.h
@@ -37,17 +37,6 @@ public:
explicit ImageManager(const DeviceInfo& info);
~ImageManager();
- enum ImageDataType {
- IMAGE_DATA_TYPE_FLOAT4 = 0,
- IMAGE_DATA_TYPE_BYTE4 = 1,
- IMAGE_DATA_TYPE_HALF4 = 2,
- IMAGE_DATA_TYPE_FLOAT = 3,
- IMAGE_DATA_TYPE_BYTE = 4,
- IMAGE_DATA_TYPE_HALF = 5,
-
- IMAGE_DATA_NUM_TYPES
- };
-
int add_image(const string& filename,
void *builtin_data,
bool animated,
@@ -68,8 +57,12 @@ public:
InterpolationType interpolation,
ExtensionType extension,
bool use_alpha);
- ImageDataType get_image_metadata(const string& filename, void *builtin_data, bool& is_linear);
+ ImageDataType get_image_metadata(const string& filename,
+ void *builtin_data,
+ bool& is_linear,
+ bool& builtin_free_cache);
+ void device_prepare_update(DeviceScene *dscene);
void device_update(Device *device,
DeviceScene *dscene,
Scene *scene,
@@ -98,19 +91,23 @@ public:
int &width,
int &height,
int &depth,
- int &channels)> builtin_image_info_cb;
+ int &channels,
+ bool &free_cache)> builtin_image_info_cb;
function<bool(const string &filename,
void *data,
unsigned char *pixels,
- const size_t pixels_size)> builtin_image_pixels_cb;
+ const size_t pixels_size,
+ const bool free_cache)> builtin_image_pixels_cb;
function<bool(const string &filename,
void *data,
float *pixels,
- const size_t pixels_size)> builtin_image_float_pixels_cb;
+ const size_t pixels_size,
+ const bool free_cache)> builtin_image_float_pixels_cb;
struct Image {
string filename;
void *builtin_data;
+ bool builtin_free_cache;
bool use_alpha;
bool need_load;
@@ -124,7 +121,9 @@ public:
private:
int tex_num_images[IMAGE_DATA_NUM_TYPES];
- int tex_start_images[IMAGE_DATA_NUM_TYPES];
+ int max_num_images;
+ bool has_half_images;
+ bool cuda_fermi_limits;
thread_mutex device_mutex;
int animation_frame;
@@ -133,7 +132,12 @@ private:
void *osl_texture_system;
bool pack_images;
- bool file_load_image_generic(Image *img, ImageInput **in, int &width, int &height, int &depth, int &components);
+ bool file_load_image_generic(Image *img,
+ ImageInput **in,
+ int &width,
+ int &height,
+ int &depth,
+ int &components);
template<TypeDesc::BASETYPE FileFormat,
typename StorageType,
@@ -143,6 +147,7 @@ private:
int texture_limit,
device_vector<DeviceType>& tex_img);
+ int max_flattened_slot(ImageDataType type);
int type_index_to_flattened_slot(int slot, ImageDataType type);
int flattened_slot_to_type_index(int flat_slot, ImageDataType *type);
string name_from_type(int type);
@@ -160,6 +165,13 @@ private:
ImageDataType type,
int slot);
+ template<typename T>
+ void device_pack_images_type(
+ ImageDataType type,
+ const vector<device_vector<T>*>& cpu_textures,
+ device_vector<T> *device_image,
+ uint4 *info);
+
void device_pack_images(Device *device,
DeviceScene *dscene,
Progress& progess);
diff --git a/intern/cycles/render/light.cpp b/intern/cycles/render/light.cpp
index 4886dcd563f..93d88c5642c 100644
--- a/intern/cycles/render/light.cpp
+++ b/intern/cycles/render/light.cpp
@@ -224,6 +224,10 @@ void LightManager::disable_ineffective_light(Device *device, Scene *scene)
bool LightManager::object_usable_as_light(Object *object) {
Mesh *mesh = object->mesh;
+ /* Skip objects with NaNs */
+ if (!object->bounds.valid()) {
+ return false;
+ }
/* Skip if we are not visible for BSDFs. */
if(!(object->visibility & (PATH_RAY_DIFFUSE|PATH_RAY_GLOSSY|PATH_RAY_TRANSMIT))) {
return false;
@@ -486,18 +490,10 @@ static void background_cdf(int start,
float2 *cond_cdf)
{
/* Conditional CDFs (rows, U direction). */
- /* NOTE: It is possible to have some NaN pixels on background
- * which will ruin CDF causing wrong shading. We replace such
- * pixels with black.
- */
for(int i = start; i < end; i++) {
float sin_theta = sinf(M_PI_F * (i + 0.5f) / res);
float3 env_color = (*pixels)[i * res];
float ave_luminance = average(env_color);
- /* TODO(sergey): Consider adding average_safe(). */
- if(!isfinite(ave_luminance)) {
- ave_luminance = 0.0f;
- }
cond_cdf[i * cdf_count].x = ave_luminance * sin_theta;
cond_cdf[i * cdf_count].y = 0.0f;
@@ -505,9 +501,6 @@ static void background_cdf(int start,
for(int j = 1; j < res; j++) {
env_color = (*pixels)[i * res + j];
ave_luminance = average(env_color);
- if(!isfinite(ave_luminance)) {
- ave_luminance = 0.0f;
- }
cond_cdf[i * cdf_count + j].x = ave_luminance * sin_theta;
cond_cdf[i * cdf_count + j].y = cond_cdf[i * cdf_count + j - 1].y + cond_cdf[i * cdf_count + j - 1].x / res;
diff --git a/intern/cycles/render/mesh.cpp b/intern/cycles/render/mesh.cpp
index a4dc06c4345..03825f780e0 100644
--- a/intern/cycles/render/mesh.cpp
+++ b/intern/cycles/render/mesh.cpp
@@ -903,7 +903,7 @@ void Mesh::pack_normals(Scene *scene, uint *tri_shader, float4 *vnormal)
float3 vNi = vN[i];
if(do_transform)
- vNi = normalize(transform_direction(&ntfm, vNi));
+ vNi = safe_normalize(transform_direction(&ntfm, vNi));
vnormal[i] = make_float4(vNi.x, vNi.y, vNi.z, 0.0f);
}
@@ -1944,6 +1944,7 @@ void MeshManager::device_update_displacement_images(Device *device,
}
}
}
+ image_manager->device_prepare_update(dscene);
foreach(int slot, bump_images) {
pool.push(function_bind(&ImageManager::device_update_slot,
image_manager,
diff --git a/intern/cycles/render/mesh_displace.cpp b/intern/cycles/render/mesh_displace.cpp
index cf28bb16bb7..4ca20cf7ef3 100644
--- a/intern/cycles/render/mesh_displace.cpp
+++ b/intern/cycles/render/mesh_displace.cpp
@@ -169,6 +169,8 @@ bool MeshManager::displace(Device *device, DeviceScene *dscene, Scene *scene, Me
if(!done[t.v[j]]) {
done[t.v[j]] = true;
float3 off = float4_to_float3(offset[k++]);
+ /* Avoid illegal vertex coordinates. */
+ off = ensure_finite3(off);
mesh->verts[t.v[j]] += off;
if(attr_mP != NULL) {
for(int step = 0; step < mesh->motion_steps - 1; step++) {
diff --git a/intern/cycles/render/nodes.cpp b/intern/cycles/render/nodes.cpp
index 1070e05a03b..90a68a06cb5 100644
--- a/intern/cycles/render/nodes.cpp
+++ b/intern/cycles/render/nodes.cpp
@@ -364,9 +364,10 @@ void ImageTextureNode::compile(OSLCompiler& compiler)
image_manager = compiler.image_manager;
if(is_float == -1) {
if(builtin_data == NULL) {
- ImageManager::ImageDataType type;
- type = image_manager->get_image_metadata(filename.string(), NULL, is_linear);
- if(type == ImageManager::IMAGE_DATA_TYPE_FLOAT || type == ImageManager::IMAGE_DATA_TYPE_FLOAT4)
+ ImageDataType type;
+ bool builtin_free_cache;
+ type = image_manager->get_image_metadata(filename.string(), NULL, is_linear, builtin_free_cache);
+ if(type == IMAGE_DATA_TYPE_FLOAT || type == IMAGE_DATA_TYPE_FLOAT4)
is_float = 1;
}
else {
@@ -553,9 +554,10 @@ void EnvironmentTextureNode::compile(OSLCompiler& compiler)
image_manager = compiler.image_manager;
if(is_float == -1) {
if(builtin_data == NULL) {
- ImageManager::ImageDataType type;
- type = image_manager->get_image_metadata(filename.string(), NULL, is_linear);
- if(type == ImageManager::IMAGE_DATA_TYPE_FLOAT || type == ImageManager::IMAGE_DATA_TYPE_FLOAT4)
+ ImageDataType type;
+ bool builtin_free_cache;
+ type = image_manager->get_image_metadata(filename.string(), NULL, is_linear, builtin_free_cache);
+ if(type == IMAGE_DATA_TYPE_FLOAT || type == IMAGE_DATA_TYPE_FLOAT4)
is_float = 1;
}
else {
@@ -1791,12 +1793,19 @@ void ConvertNode::compile(OSLCompiler& compiler)
assert(0);
}
+/* Base type for all closure-type nodes */
+
+BsdfBaseNode::BsdfBaseNode(const NodeType *node_type)
+ : ShaderNode(node_type)
+{
+ special_type = SHADER_SPECIAL_TYPE_CLOSURE;
+}
+
/* BSDF Closure */
BsdfNode::BsdfNode(const NodeType *node_type)
-: ShaderNode(node_type)
+: BsdfBaseNode(node_type)
{
- special_type = SHADER_SPECIAL_TYPE_CLOSURE;
}
void BsdfNode::compile(SVMCompiler& compiler, ShaderInput *param1, ShaderInput *param2, ShaderInput *param3, ShaderInput *param4)
@@ -2286,6 +2295,155 @@ void DiffuseBsdfNode::compile(OSLCompiler& compiler)
compiler.add(this, "node_diffuse_bsdf");
}
+/* Disney principled BSDF Closure */
+NODE_DEFINE(PrincipledBsdfNode)
+{
+ NodeType* type = NodeType::add("principled_bsdf", create, NodeType::SHADER);
+
+ static NodeEnum distribution_enum;
+ distribution_enum.insert("GGX", CLOSURE_BSDF_MICROFACET_GGX_GLASS_ID);
+ distribution_enum.insert("Multiscatter GGX", CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID);
+ SOCKET_ENUM(distribution, "Distribution", distribution_enum, CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID);
+ SOCKET_IN_COLOR(base_color, "Base Color", make_float3(0.8f, 0.8f, 0.8f));
+ SOCKET_IN_COLOR(subsurface_color, "Subsurface Color", make_float3(0.8f, 0.8f, 0.8f));
+ SOCKET_IN_FLOAT(metallic, "Metallic", 0.0f);
+ SOCKET_IN_FLOAT(subsurface, "Subsurface", 0.0f);
+ SOCKET_IN_VECTOR(subsurface_radius, "Subsurface Radius", make_float3(0.1f, 0.1f, 0.1f));
+ SOCKET_IN_FLOAT(specular, "Specular", 0.0f);
+ SOCKET_IN_FLOAT(roughness, "Roughness", 0.5f);
+ SOCKET_IN_FLOAT(specular_tint, "Specular Tint", 0.0f);
+ SOCKET_IN_FLOAT(anisotropic, "Anisotropic", 0.0f);
+ SOCKET_IN_FLOAT(sheen, "Sheen", 0.0f);
+ SOCKET_IN_FLOAT(sheen_tint, "Sheen Tint", 0.0f);
+ SOCKET_IN_FLOAT(clearcoat, "Clearcoat", 0.0f);
+ SOCKET_IN_FLOAT(clearcoat_roughness, "Clearcoat Roughness", 0.03f);
+ SOCKET_IN_FLOAT(ior, "IOR", 0.0f);
+ SOCKET_IN_FLOAT(transmission, "Transmission", 0.0f);
+ SOCKET_IN_FLOAT(transmission_roughness, "Transmission Roughness", 0.0f);
+ SOCKET_IN_FLOAT(anisotropic_rotation, "Anisotropic Rotation", 0.0f);
+ SOCKET_IN_NORMAL(normal, "Normal", make_float3(0.0f, 0.0f, 0.0f), SocketType::LINK_NORMAL);
+ SOCKET_IN_NORMAL(clearcoat_normal, "Clearcoat Normal", make_float3(0.0f, 0.0f, 0.0f), SocketType::LINK_NORMAL);
+ SOCKET_IN_NORMAL(tangent, "Tangent", make_float3(0.0f, 0.0f, 0.0f), SocketType::LINK_TANGENT);
+ SOCKET_IN_FLOAT(surface_mix_weight, "SurfaceMixWeight", 0.0f, SocketType::SVM_INTERNAL);
+
+ SOCKET_OUT_CLOSURE(BSDF, "BSDF");
+
+ return type;
+}
+
+PrincipledBsdfNode::PrincipledBsdfNode()
+ : BsdfBaseNode(node_type)
+{
+ closure = CLOSURE_BSDF_PRINCIPLED_ID;
+ distribution = CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID;
+ distribution_orig = NBUILTIN_CLOSURES;
+}
+
+bool PrincipledBsdfNode::has_surface_bssrdf()
+{
+ ShaderInput *subsurface_in = input("Subsurface");
+ return (subsurface_in->link != NULL || subsurface > CLOSURE_WEIGHT_CUTOFF);
+}
+
+void PrincipledBsdfNode::attributes(Shader *shader, AttributeRequestSet *attributes)
+{
+ if(shader->has_surface) {
+ ShaderInput *tangent_in = input("Tangent");
+
+ if(!tangent_in->link)
+ attributes->add(ATTR_STD_GENERATED);
+ }
+
+ ShaderNode::attributes(shader, attributes);
+}
+
+void PrincipledBsdfNode::compile(SVMCompiler& compiler, ShaderInput *p_metallic, ShaderInput *p_subsurface, ShaderInput *p_subsurface_radius,
+ ShaderInput *p_specular, ShaderInput *p_roughness, ShaderInput *p_specular_tint, ShaderInput *p_anisotropic,
+ ShaderInput *p_sheen, ShaderInput *p_sheen_tint, ShaderInput *p_clearcoat, ShaderInput *p_clearcoat_roughness,
+ ShaderInput *p_ior, ShaderInput *p_transmission, ShaderInput *p_anisotropic_rotation, ShaderInput *p_transmission_roughness)
+{
+ ShaderInput *base_color_in = input("Base Color");
+ ShaderInput *subsurface_color_in = input("Subsurface Color");
+ ShaderInput *normal_in = input("Normal");
+ ShaderInput *clearcoat_normal_in = input("Clearcoat Normal");
+ ShaderInput *tangent_in = input("Tangent");
+
+ float3 weight = make_float3(1.0f, 1.0f, 1.0f);
+
+ compiler.add_node(NODE_CLOSURE_SET_WEIGHT, weight);
+
+ int normal_offset = compiler.stack_assign_if_linked(normal_in);
+ int clearcoat_normal_offset = compiler.stack_assign_if_linked(clearcoat_normal_in);
+ int tangent_offset = compiler.stack_assign_if_linked(tangent_in);
+ int specular_offset = compiler.stack_assign(p_specular);
+ int roughness_offset = compiler.stack_assign(p_roughness);
+ int specular_tint_offset = compiler.stack_assign(p_specular_tint);
+ int anisotropic_offset = compiler.stack_assign(p_anisotropic);
+ int sheen_offset = compiler.stack_assign(p_sheen);
+ int sheen_tint_offset = compiler.stack_assign(p_sheen_tint);
+ int clearcoat_offset = compiler.stack_assign(p_clearcoat);
+ int clearcoat_roughness_offset = compiler.stack_assign(p_clearcoat_roughness);
+ int ior_offset = compiler.stack_assign(p_ior);
+ int transmission_offset = compiler.stack_assign(p_transmission);
+ int transmission_roughness_offset = compiler.stack_assign(p_transmission_roughness);
+ int anisotropic_rotation_offset = compiler.stack_assign(p_anisotropic_rotation);
+ int subsurface_radius_offset = compiler.stack_assign(p_subsurface_radius);
+
+ compiler.add_node(NODE_CLOSURE_BSDF,
+ compiler.encode_uchar4(closure,
+ compiler.stack_assign(p_metallic),
+ compiler.stack_assign(p_subsurface),
+ compiler.closure_mix_weight_offset()),
+ __float_as_int((p_metallic) ? get_float(p_metallic->socket_type) : 0.0f),
+ __float_as_int((p_subsurface) ? get_float(p_subsurface->socket_type) : 0.0f));
+
+ compiler.add_node(normal_offset, tangent_offset,
+ compiler.encode_uchar4(specular_offset, roughness_offset, specular_tint_offset, anisotropic_offset),
+ compiler.encode_uchar4(sheen_offset, sheen_tint_offset, clearcoat_offset, clearcoat_roughness_offset));
+
+ compiler.add_node(compiler.encode_uchar4(ior_offset, transmission_offset, anisotropic_rotation_offset, transmission_roughness_offset),
+ distribution, SVM_STACK_INVALID, SVM_STACK_INVALID);
+
+ float3 bc_default = get_float3(base_color_in->socket_type);
+
+ compiler.add_node(((base_color_in->link) ? compiler.stack_assign(base_color_in) : SVM_STACK_INVALID),
+ __float_as_int(bc_default.x), __float_as_int(bc_default.y), __float_as_int(bc_default.z));
+
+ compiler.add_node(clearcoat_normal_offset, subsurface_radius_offset, SVM_STACK_INVALID, SVM_STACK_INVALID);
+
+ float3 ss_default = get_float3(subsurface_color_in->socket_type);
+
+ compiler.add_node(((subsurface_color_in->link) ? compiler.stack_assign(subsurface_color_in) : SVM_STACK_INVALID),
+ __float_as_int(ss_default.x), __float_as_int(ss_default.y), __float_as_int(ss_default.z));
+}
+
+bool PrincipledBsdfNode::has_integrator_dependency()
+{
+ ShaderInput *roughness_input = input("Roughness");
+ return !roughness_input->link && roughness <= 1e-4f;
+}
+
+void PrincipledBsdfNode::compile(SVMCompiler& compiler)
+{
+ compile(compiler, input("Metallic"), input("Subsurface"), input("Subsurface Radius"), input("Specular"),
+ input("Roughness"), input("Specular Tint"), input("Anisotropic"), input("Sheen"), input("Sheen Tint"),
+ input("Clearcoat"), input("Clearcoat Roughness"), input("IOR"), input("Transmission"),
+ input("Anisotropic Rotation"), input("Transmission Roughness"));
+}
+
+void PrincipledBsdfNode::compile(OSLCompiler& compiler)
+{
+ compiler.parameter(this, "distribution");
+ compiler.add(this, "node_principled_bsdf");
+}
+
+bool PrincipledBsdfNode::has_bssrdf_bump()
+{
+ /* detect if anything is plugged into the normal input besides the default */
+ ShaderInput *normal_in = input("Normal");
+ return (normal_in->link && normal_in->link->parent->special_type != SHADER_SPECIAL_TYPE_GEOMETRY);
+}
+
/* Translucent BSDF Closure */
NODE_DEFINE(TranslucentBsdfNode)
diff --git a/intern/cycles/render/nodes.h b/intern/cycles/render/nodes.h
index a755b653a5b..c0271a3c8eb 100644
--- a/intern/cycles/render/nodes.h
+++ b/intern/cycles/render/nodes.h
@@ -252,6 +252,7 @@ public:
class PointDensityTextureNode : public ShaderNode {
public:
SHADER_NODE_NO_CLONE_CLASS(PointDensityTextureNode)
+ virtual int get_group() { return NODE_GROUP_LEVEL_3; }
~PointDensityTextureNode();
ShaderNode *clone() const;
@@ -321,7 +322,14 @@ private:
static bool initialized;
};
-class BsdfNode : public ShaderNode {
+class BsdfBaseNode : public ShaderNode {
+public:
+ BsdfBaseNode(const NodeType *node_type);
+
+ ClosureType closure;
+};
+
+class BsdfNode : public BsdfBaseNode {
public:
explicit BsdfNode(const NodeType *node_type);
SHADER_NODE_BASE_CLASS(BsdfNode)
@@ -333,7 +341,6 @@ public:
float3 color;
float3 normal;
float surface_mix_weight;
- ClosureType closure;
virtual bool equals(const ShaderNode& /*other*/)
{
@@ -361,6 +368,39 @@ public:
float roughness;
};
+/* Disney principled BRDF */
+class PrincipledBsdfNode : public BsdfBaseNode {
+public:
+ SHADER_NODE_CLASS(PrincipledBsdfNode)
+
+ bool has_spatial_varying() { return true; }
+ bool has_surface_bssrdf();
+ bool has_bssrdf_bump();
+ void compile(SVMCompiler& compiler, ShaderInput *metallic, ShaderInput *subsurface, ShaderInput *subsurface_radius,
+ ShaderInput *specular, ShaderInput *roughness, ShaderInput *specular_tint, ShaderInput *anisotropic,
+ ShaderInput *sheen, ShaderInput *sheen_tint, ShaderInput *clearcoat, ShaderInput *clearcoat_roughness,
+ ShaderInput *ior, ShaderInput *transmission, ShaderInput *anisotropic_rotation, ShaderInput *transmission_roughness);
+
+ float3 base_color;
+ float3 subsurface_color, subsurface_radius;
+ float metallic, subsurface, specular, roughness, specular_tint, anisotropic,
+ sheen, sheen_tint, clearcoat, clearcoat_roughness, ior, transmission,
+ anisotropic_rotation, transmission_roughness;
+ float3 normal, clearcoat_normal, tangent;
+ float surface_mix_weight;
+ ClosureType distribution, distribution_orig;
+
+ virtual bool equals(const ShaderNode * /*other*/)
+ {
+ /* TODO(sergey): With some care BSDF nodes can be de-duplicated. */
+ return false;
+ }
+
+ ClosureType get_closure_type() { return closure; }
+ bool has_integrator_dependency();
+ void attributes(Shader *shader, AttributeRequestSet *attributes);
+};
+
class TranslucentBsdfNode : public BsdfNode {
public:
SHADER_NODE_CLASS(TranslucentBsdfNode)
@@ -445,6 +485,7 @@ public:
virtual ClosureType get_closure_type() { return CLOSURE_EMISSION_ID; }
bool has_surface_emission() { return true; }
+ bool has_volume_support() { return true; }
float3 color;
float strength;
@@ -496,6 +537,7 @@ public:
return ShaderNode::get_feature() | NODE_FEATURE_VOLUME;
}
virtual ClosureType get_closure_type() { return closure; }
+ virtual bool has_volume_support() { return true; }
float3 color;
float density;
diff --git a/intern/cycles/render/osl.cpp b/intern/cycles/render/osl.cpp
index 6bff29d1c76..a794f233718 100644
--- a/intern/cycles/render/osl.cpp
+++ b/intern/cycles/render/osl.cpp
@@ -156,6 +156,7 @@ void OSLShaderManager::device_free(Device *device, DeviceScene *dscene, Scene *s
og->surface_state.clear();
og->volume_state.clear();
og->displacement_state.clear();
+ og->bump_state.clear();
og->background_state.reset();
}
diff --git a/intern/cycles/render/scene.h b/intern/cycles/render/scene.h
index 2b5267642a2..4c2c4f5fcc3 100644
--- a/intern/cycles/render/scene.h
+++ b/intern/cycles/render/scene.h
@@ -114,18 +114,18 @@ public:
device_vector<uint> sobol_directions;
/* cpu images */
- device_vector<uchar4> tex_byte4_image[TEX_NUM_BYTE4_CPU];
- device_vector<float4> tex_float4_image[TEX_NUM_FLOAT4_CPU];
- device_vector<float> tex_float_image[TEX_NUM_FLOAT_CPU];
- device_vector<uchar> tex_byte_image[TEX_NUM_BYTE_CPU];
- device_vector<half4> tex_half4_image[TEX_NUM_HALF4_CPU];
- device_vector<half> tex_half_image[TEX_NUM_HALF_CPU];
+ vector<device_vector<float4>* > tex_float4_image;
+ vector<device_vector<uchar4>* > tex_byte4_image;
+ vector<device_vector<half4>* > tex_half4_image;
+ vector<device_vector<float>* > tex_float_image;
+ vector<device_vector<uchar>* > tex_byte_image;
+ vector<device_vector<half>* > tex_half_image;
/* opencl images */
- device_vector<uchar4> tex_image_byte4_packed;
device_vector<float4> tex_image_float4_packed;
- device_vector<uchar> tex_image_byte_packed;
+ device_vector<uchar4> tex_image_byte4_packed;
device_vector<float> tex_image_float_packed;
+ device_vector<uchar> tex_image_byte_packed;
device_vector<uint4> tex_image_packed_info;
KernelData data;
diff --git a/intern/cycles/render/session.cpp b/intern/cycles/render/session.cpp
index c9b5547b407..8622318858e 100644
--- a/intern/cycles/render/session.cpp
+++ b/intern/cycles/render/session.cpp
@@ -114,8 +114,9 @@ Session::~Session()
}
/* clean up */
- foreach(RenderBuffers *buffers, tile_buffers)
- delete buffers;
+ foreach(RenderTile &rtile, render_tiles)
+ delete rtile.buffers;
+ tile_manager.free_device();
delete buffers;
delete display;
@@ -268,8 +269,8 @@ void Session::run_gpu()
/* update status and timing */
update_status_time();
- /* path trace */
- path_trace();
+ /* render */
+ render();
device->task_wait();
@@ -358,20 +359,22 @@ bool Session::acquire_tile(Device *tile_device, RenderTile& rtile)
thread_scoped_lock tile_lock(tile_mutex);
/* get next tile from manager */
- Tile tile;
+ Tile *tile;
int device_num = device->device_number(tile_device);
if(!tile_manager.next_tile(tile, device_num))
return false;
/* fill render tile */
- rtile.x = tile_manager.state.buffer.full_x + tile.x;
- rtile.y = tile_manager.state.buffer.full_y + tile.y;
- rtile.w = tile.w;
- rtile.h = tile.h;
+ rtile.x = tile_manager.state.buffer.full_x + tile->x;
+ rtile.y = tile_manager.state.buffer.full_y + tile->y;
+ rtile.w = tile->w;
+ rtile.h = tile->h;
rtile.start_sample = tile_manager.state.sample;
rtile.num_samples = tile_manager.state.num_samples;
rtile.resolution = tile_manager.state.resolution_divider;
+ rtile.tile_index = tile->index;
+ rtile.task = (tile->state == Tile::DENOISE)? RenderTile::DENOISE: RenderTile::PATH_TRACE;
tile_lock.unlock();
@@ -383,54 +386,70 @@ bool Session::acquire_tile(Device *tile_device, RenderTile& rtile)
rtile.buffer = buffers->buffer.device_pointer;
rtile.rng_state = buffers->rng_state.device_pointer;
rtile.buffers = buffers;
+ tile->buffers = buffers;
device->map_tile(tile_device, rtile);
return true;
}
- /* fill buffer parameters */
- BufferParams buffer_params = tile_manager.params;
- buffer_params.full_x = rtile.x;
- buffer_params.full_y = rtile.y;
- buffer_params.width = rtile.w;
- buffer_params.height = rtile.h;
-
- buffer_params.get_offset_stride(rtile.offset, rtile.stride);
-
- RenderBuffers *tilebuffers;
+ bool store_rtile = false;
+ if(tile->buffers == NULL) {
+ /* fill buffer parameters */
+ BufferParams buffer_params = tile_manager.params;
+ buffer_params.full_x = rtile.x;
+ buffer_params.full_y = rtile.y;
+ buffer_params.width = rtile.w;
+ buffer_params.height = rtile.h;
+
+ /* allocate buffers */
+ if(params.progressive_refine) {
+ tile_lock.lock();
+
+ if(render_tiles.size() == 0) {
+ RenderTile nulltile;
+ nulltile.buffers = NULL;
+ render_tiles.resize(tile_manager.state.num_tiles, nulltile);
+ }
- /* allocate buffers */
- if(params.progressive_refine) {
- tile_lock.lock();
+ /* In certain circumstances number of tiles in the tile manager could
+ * be changed. This is not supported by the progressive refine feature.
+ */
+ assert(render_tiles.size() == tile_manager.state.num_tiles);
- if(tile_buffers.size() == 0)
- tile_buffers.resize(tile_manager.state.num_tiles, NULL);
+ RenderTile &stored_rtile = render_tiles[tile->index];
+ if(stored_rtile.buffers == NULL) {
+ tile->buffers = new RenderBuffers(tile_device);
+ tile->buffers->reset(tile_device, buffer_params);
+ store_rtile = true;
+ }
+ else {
+ assert(rtile.x == stored_rtile.x &&
+ rtile.y == stored_rtile.y &&
+ rtile.w == stored_rtile.w &&
+ rtile.h == stored_rtile.h);
+ tile_lock.unlock();
+ tile->buffers = stored_rtile.buffers;
+ }
+ }
+ else {
+ tile->buffers = new RenderBuffers(tile_device);
- /* In certain circumstances number of tiles in the tile manager could
- * be changed. This is not supported by the progressive refine feature.
- */
- assert(tile_buffers.size() == tile_manager.state.num_tiles);
+ tile->buffers->reset(tile_device, buffer_params);
+ }
+ }
- tilebuffers = tile_buffers[tile.index];
- if(tilebuffers == NULL) {
- tilebuffers = new RenderBuffers(tile_device);
- tile_buffers[tile.index] = tilebuffers;
+ tile->buffers->params.get_offset_stride(rtile.offset, rtile.stride);
- tilebuffers->reset(tile_device, buffer_params);
- }
+ rtile.buffer = tile->buffers->buffer.device_pointer;
+ rtile.rng_state = tile->buffers->rng_state.device_pointer;
+ rtile.buffers = tile->buffers;
+ rtile.sample = 0;
+ if(store_rtile) {
+ render_tiles[tile->index] = rtile;
tile_lock.unlock();
}
- else {
- tilebuffers = new RenderBuffers(tile_device);
-
- tilebuffers->reset(tile_device, buffer_params);
- }
-
- rtile.buffer = tilebuffers->buffer.device_pointer;
- rtile.rng_state = tilebuffers->rng_state.device_pointer;
- rtile.buffers = tilebuffers;
/* this will tag tile as IN PROGRESS in blender-side render pipeline,
* which is needed to highlight currently rendering tile before first
@@ -449,7 +468,7 @@ void Session::update_tile_sample(RenderTile& rtile)
if(params.progressive_refine == false) {
/* todo: optimize this by making it thread safe and removing lock */
- update_render_tile_cb(rtile);
+ update_render_tile_cb(rtile, true);
}
}
@@ -460,20 +479,77 @@ void Session::release_tile(RenderTile& rtile)
{
thread_scoped_lock tile_lock(tile_mutex);
- progress.add_finished_tile();
+ progress.add_finished_tile(rtile.task == RenderTile::DENOISE);
- if(write_render_tile_cb) {
- if(params.progressive_refine == false) {
- /* todo: optimize this by making it thread safe and removing lock */
- write_render_tile_cb(rtile);
+ bool delete_tile;
- delete rtile.buffers;
+ if(tile_manager.finish_tile(rtile.tile_index, delete_tile)) {
+ if(write_render_tile_cb && params.progressive_refine == false) {
+ write_render_tile_cb(rtile);
+ if(delete_tile) {
+ delete rtile.buffers;
+ tile_manager.state.tiles[rtile.tile_index].buffers = NULL;
+ }
+ }
+ }
+ else {
+ if(update_render_tile_cb && params.progressive_refine == false) {
+ update_render_tile_cb(rtile, false);
}
}
update_status_time();
}
+void Session::map_neighbor_tiles(RenderTile *tiles, Device *tile_device)
+{
+ thread_scoped_lock tile_lock(tile_mutex);
+
+ int center_idx = tiles[4].tile_index;
+ assert(tile_manager.state.tiles[center_idx].state == Tile::DENOISE);
+ BufferParams buffer_params = tile_manager.params;
+ int4 image_region = make_int4(buffer_params.full_x, buffer_params.full_y,
+ buffer_params.full_x + buffer_params.width, buffer_params.full_y + buffer_params.height);
+
+ for(int dy = -1, i = 0; dy <= 1; dy++) {
+ for(int dx = -1; dx <= 1; dx++, i++) {
+ int px = tiles[4].x + dx*params.tile_size.x;
+ int py = tiles[4].y + dy*params.tile_size.y;
+ if(px >= image_region.x && py >= image_region.y &&
+ px < image_region.z && py < image_region.w) {
+ int tile_index = center_idx + dy*tile_manager.state.tile_stride + dx;
+ Tile *tile = &tile_manager.state.tiles[tile_index];
+ assert(tile->buffers);
+
+ tiles[i].buffer = tile->buffers->buffer.device_pointer;
+ tiles[i].x = tile_manager.state.buffer.full_x + tile->x;
+ tiles[i].y = tile_manager.state.buffer.full_y + tile->y;
+ tiles[i].w = tile->w;
+ tiles[i].h = tile->h;
+ tiles[i].buffers = tile->buffers;
+
+ tile->buffers->params.get_offset_stride(tiles[i].offset, tiles[i].stride);
+ }
+ else {
+ tiles[i].buffer = (device_ptr)NULL;
+ tiles[i].buffers = NULL;
+ tiles[i].x = clamp(px, image_region.x, image_region.z);
+ tiles[i].y = clamp(py, image_region.y, image_region.w);
+ tiles[i].w = tiles[i].h = 0;
+ }
+ }
+ }
+
+ assert(tiles[4].buffers);
+ device->map_neighbor_tiles(tile_device, tiles);
+}
+
+void Session::unmap_neighbor_tiles(RenderTile *tiles, Device *tile_device)
+{
+ thread_scoped_lock tile_lock(tile_mutex);
+ device->unmap_neighbor_tiles(tile_device, tiles);
+}
+
void Session::run_cpu()
{
bool tiles_written = false;
@@ -558,8 +634,8 @@ void Session::run_cpu()
/* update status and timing */
update_status_time();
- /* path trace */
- path_trace();
+ /* render */
+ render();
/* update status and timing */
update_status_time();
@@ -646,20 +722,25 @@ DeviceRequestedFeatures Session::get_requested_device_features()
requested_features.use_baking = bake_manager->get_baking();
requested_features.use_integrator_branched = (scene->integrator->method == Integrator::BRANCHED_PATH);
requested_features.use_transparent &= scene->integrator->transparent_shadows;
+ requested_features.use_denoising = params.use_denoising;
return requested_features;
}
-void Session::load_kernels()
+void Session::load_kernels(bool lock_scene)
{
- thread_scoped_lock scene_lock(scene->mutex);
+ thread_scoped_lock scene_lock;
+ if(lock_scene) {
+ scene_lock = thread_scoped_lock(scene->mutex);
+ }
+
+ DeviceRequestedFeatures requested_features = get_requested_device_features();
- if(!kernels_loaded) {
+ if(!kernels_loaded || loaded_kernel_features.modified(requested_features)) {
progress.set_status("Loading render kernels (may take a few minutes the first time)");
scoped_timer timer;
- DeviceRequestedFeatures requested_features = get_requested_device_features();
VLOG(2) << "Requested features:\n" << requested_features;
if(!device->load_kernels(requested_features)) {
string message = device->error_message();
@@ -676,6 +757,7 @@ void Session::load_kernels()
VLOG(1) << "Total time spent loading kernels: " << time_dt() - timer.get_start();
kernels_loaded = true;
+ loaded_kernel_features = requested_features;
}
}
@@ -744,10 +826,10 @@ void Session::reset(BufferParams& buffer_params, int samples)
if(params.progressive_refine) {
thread_scoped_lock buffers_lock(buffers_mutex);
- foreach(RenderBuffers *buffers, tile_buffers)
- delete buffers;
+ foreach(RenderTile &rtile, render_tiles)
+ delete rtile.buffers;
- tile_buffers.clear();
+ render_tiles.clear();
}
}
@@ -826,6 +908,8 @@ void Session::update_scene()
/* update scene */
if(scene->need_update()) {
+ load_kernels(false);
+
progress.set_status("Updating Scene");
MEM_GUARDED_CALL(&progress, scene->device_update, device, progress);
}
@@ -836,7 +920,7 @@ void Session::update_status_time(bool show_pause, bool show_done)
int progressive_sample = tile_manager.state.sample;
int num_samples = tile_manager.get_num_effective_samples();
- int tile = progress.get_finished_tiles();
+ int tile = progress.get_rendered_tiles();
int num_tiles = tile_manager.state.num_tiles;
/* update status */
@@ -844,11 +928,12 @@ void Session::update_status_time(bool show_pause, bool show_done)
if(!params.progressive) {
const bool is_cpu = params.device.type == DEVICE_CPU;
+ const bool rendering_finished = (tile == num_tiles);
const bool is_last_tile = (tile + 1) == num_tiles;
substatus = string_printf("Path Tracing Tile %d/%d", tile, num_tiles);
- if(device->show_samples() || (is_cpu && is_last_tile)) {
+ if(!rendering_finished && (device->show_samples() || (is_cpu && is_last_tile))) {
/* Some devices automatically support showing the sample number:
* - CUDADevice
* - OpenCLDevice when using the megakernel (the split kernel renders multiple
@@ -860,6 +945,9 @@ void Session::update_status_time(bool show_pause, bool show_done)
*/
substatus += string_printf(", Sample %d/%d", progress.get_current_sample(), num_samples);
}
+ if(params.use_denoising) {
+ substatus += string_printf(", Denoised %d tiles", progress.get_denoised_tiles());
+ }
}
else if(tile_manager.num_samples == INT_MAX)
substatus = string_printf("Path Tracing Sample %d", progressive_sample+1);
@@ -873,6 +961,7 @@ void Session::update_status_time(bool show_pause, bool show_done)
}
else if(show_done) {
status = "Done";
+ progress.set_end_time(); /* Save end time so that further calls to get_time are accurate. */
}
else {
status = substatus;
@@ -882,13 +971,15 @@ void Session::update_status_time(bool show_pause, bool show_done)
progress.set_status(status, substatus);
}
-void Session::path_trace()
+void Session::render()
{
/* add path trace task */
- DeviceTask task(DeviceTask::PATH_TRACE);
+ DeviceTask task(DeviceTask::RENDER);
task.acquire_tile = function_bind(&Session::acquire_tile, this, _1, _2);
task.release_tile = function_bind(&Session::release_tile, this, _1);
+ task.map_neighbor_tiles = function_bind(&Session::map_neighbor_tiles, this, _1, _2);
+ task.unmap_neighbor_tiles = function_bind(&Session::unmap_neighbor_tiles, this, _1, _2);
task.get_cancel = function_bind(&Progress::get_cancel, &this->progress);
task.update_tile_sample = function_bind(&Session::update_tile_sample, this, _1);
task.update_progress_sample = function_bind(&Progress::add_samples, &this->progress, _1, _2);
@@ -897,6 +988,18 @@ void Session::path_trace()
task.requested_tile_size = params.tile_size;
task.passes_size = tile_manager.params.get_passes_size();
+ if(params.use_denoising) {
+ task.denoising_radius = params.denoising_radius;
+ task.denoising_strength = params.denoising_strength;
+ task.denoising_feature_strength = params.denoising_feature_strength;
+ task.denoising_relative_pca = params.denoising_relative_pca;
+
+ assert(!scene->film->need_update);
+ task.pass_stride = scene->film->pass_stride;
+ task.pass_denoising_data = scene->film->denoising_data_offset;
+ task.pass_denoising_clean = scene->film->denoising_clean_offset;
+ }
+
device->task_add(task);
}
@@ -940,9 +1043,7 @@ bool Session::update_progressive_refine(bool cancel)
}
if(params.progressive_refine) {
- foreach(RenderBuffers *buffers, tile_buffers) {
- RenderTile rtile;
- rtile.buffers = buffers;
+ foreach(RenderTile &rtile, render_tiles) {
rtile.sample = sample;
if(write) {
@@ -951,7 +1052,7 @@ bool Session::update_progressive_refine(bool cancel)
}
else {
if(update_render_tile_cb)
- update_render_tile_cb(rtile);
+ update_render_tile_cb(rtile, true);
}
}
}
@@ -965,10 +1066,11 @@ void Session::device_free()
{
scene->device_free();
- foreach(RenderBuffers *buffers, tile_buffers)
- delete buffers;
+ foreach(RenderTile &tile, render_tiles)
+ delete tile.buffers;
+ tile_manager.free_device();
- tile_buffers.clear();
+ render_tiles.clear();
/* used from background render only, so no need to
* re-create render/display buffers here
diff --git a/intern/cycles/render/session.h b/intern/cycles/render/session.h
index a7e5f78a64d..9f8bb8c42fa 100644
--- a/intern/cycles/render/session.h
+++ b/intern/cycles/render/session.h
@@ -57,6 +57,12 @@ public:
bool display_buffer_linear;
+ bool use_denoising;
+ int denoising_radius;
+ float denoising_strength;
+ float denoising_feature_strength;
+ bool denoising_relative_pca;
+
double cancel_timeout;
double reset_timeout;
double text_timeout;
@@ -77,6 +83,12 @@ public:
start_resolution = INT_MAX;
threads = 0;
+ use_denoising = false;
+ denoising_radius = 8;
+ denoising_strength = 0.0f;
+ denoising_feature_strength = 0.0f;
+ denoising_relative_pca = false;
+
display_buffer_linear = false;
cancel_timeout = 0.1;
@@ -126,7 +138,7 @@ public:
Stats stats;
function<void(RenderTile&)> write_render_tile_cb;
- function<void(RenderTile&)> update_render_tile_cb;
+ function<void(RenderTile&, bool)> update_render_tile_cb;
explicit Session(const SessionParams& params);
~Session();
@@ -141,7 +153,7 @@ public:
void set_pause(bool pause);
void update_scene();
- void load_kernels();
+ void load_kernels(bool lock_scene=true);
void device_free();
@@ -162,7 +174,7 @@ protected:
void update_status_time(bool show_pause = false, bool show_done = false);
void tonemap(int sample);
- void path_trace();
+ void render();
void reset_(BufferParams& params, int samples);
void run_cpu();
@@ -177,6 +189,9 @@ protected:
void update_tile_sample(RenderTile& tile);
void release_tile(RenderTile& tile);
+ void map_neighbor_tiles(RenderTile *tiles, Device *tile_device);
+ void unmap_neighbor_tiles(RenderTile *tiles, Device *tile_device);
+
bool device_use_gl;
thread *session_thread;
@@ -195,6 +210,7 @@ protected:
thread_mutex display_mutex;
bool kernels_loaded;
+ DeviceRequestedFeatures loaded_kernel_features;
double reset_time;
@@ -202,7 +218,7 @@ protected:
double last_update_time;
bool update_progressive_refine(bool cancel);
- vector<RenderBuffers *> tile_buffers;
+ vector<RenderTile> render_tiles;
DeviceRequestedFeatures get_requested_device_features();
diff --git a/intern/cycles/render/shader.cpp b/intern/cycles/render/shader.cpp
index 23eee1916bd..44a266dfe18 100644
--- a/intern/cycles/render/shader.cpp
+++ b/intern/cycles/render/shader.cpp
@@ -49,6 +49,16 @@ static float beckmann_table_slope_max()
return 6.0;
}
+
+/* MSVC 2015 needs this ugly hack to prevent a codegen bug on x86
+ * see T50176 for details
+ */
+#if defined(_MSC_VER) && (_MSC_VER == 1900)
+# define MSVC_VOLATILE volatile
+#else
+# define MSVC_VOLATILE
+#endif
+
/* Paper used: Importance Sampling Microfacet-Based BSDFs with the
* Distribution of Visible Normals. Supplemental Material 2/2.
*
@@ -72,7 +82,7 @@ static void beckmann_table_rows(float *table, int row_from, int row_to)
slope_x[0] = (double)-beckmann_table_slope_max();
CDF_P22_omega_i[0] = 0;
- for(int index_slope_x = 1; index_slope_x < DATA_TMP_SIZE; ++index_slope_x) {
+ for(MSVC_VOLATILE int index_slope_x = 1; index_slope_x < DATA_TMP_SIZE; ++index_slope_x) {
/* slope_x */
slope_x[index_slope_x] = (double)(-beckmann_table_slope_max() + 2.0f * beckmann_table_slope_max() * index_slope_x/(DATA_TMP_SIZE - 1.0f));
@@ -116,6 +126,8 @@ static void beckmann_table_rows(float *table, int row_from, int row_to)
}
}
+#undef MSVC_VOLATILE
+
static void beckmann_table_build(vector<float>& table)
{
table.resize(BECKMANN_TABLE_SIZE*BECKMANN_TABLE_SIZE);
@@ -178,6 +190,7 @@ Shader::Shader()
has_volume_spatial_varying = false;
has_object_dependency = false;
has_integrator_dependency = false;
+ has_volume_connected = false;
displacement_method = DISPLACE_BUMP;
@@ -229,6 +242,10 @@ void Shader::set_graph(ShaderGraph *graph_)
delete graph_bump;
graph = graph_;
graph_bump = NULL;
+
+ /* Store info here before graph optimization to make sure that
+ * nodes that get optimized away still count. */
+ has_volume_connected = (graph->output()->input("Volume")->link != NULL);
}
void Shader::tag_update(Scene *scene)
@@ -319,11 +336,14 @@ ShaderManager *ShaderManager::create(Scene *scene, int shadingsystem)
(void)shadingsystem; /* Ignored when built without OSL. */
#ifdef WITH_OSL
- if(shadingsystem == SHADINGSYSTEM_OSL)
+ if(shadingsystem == SHADINGSYSTEM_OSL) {
manager = new OSLShaderManager();
+ }
else
#endif
+ {
manager = new SVMShaderManager();
+ }
add_default(scene);
@@ -420,15 +440,14 @@ void ShaderManager::device_update_common(Device *device,
flag |= SD_HAS_VOLUME;
has_volumes = true;
- /* in this case we can assume transparent surface */
- if(!shader->has_surface)
- flag |= SD_HAS_ONLY_VOLUME;
-
/* todo: this could check more fine grained, to skip useless volumes
* enclosed inside an opaque bsdf.
*/
flag |= SD_HAS_TRANSPARENT_SHADOW;
}
+ /* in this case we can assume transparent surface */
+ if(shader->has_volume_connected && !shader->has_surface)
+ flag |= SD_HAS_ONLY_VOLUME;
if(shader->heterogeneous_volume && shader->has_volume_spatial_varying)
flag |= SD_HETEROGENEOUS_VOLUME;
if(shader->has_bssrdf_bump)
@@ -569,6 +588,9 @@ void ShaderManager::get_requested_graph_features(ShaderGraph *graph,
if(CLOSURE_IS_VOLUME(bsdf_node->closure)) {
requested_features->nodes_features |= NODE_FEATURE_VOLUME;
}
+ else if(CLOSURE_IS_PRINCIPLED(bsdf_node->closure)) {
+ requested_features->use_principled = true;
+ }
}
if(node->has_surface_bssrdf()) {
requested_features->use_subsurface = true;
diff --git a/intern/cycles/render/shader.h b/intern/cycles/render/shader.h
index a8018231f1a..b6714b13247 100644
--- a/intern/cycles/render/shader.h
+++ b/intern/cycles/render/shader.h
@@ -105,6 +105,15 @@ public:
bool need_update;
bool need_update_attributes;
+ /* If the shader has only volume components, the surface is assumed to
+ * be transparent.
+ * However, graph optimization might remove the volume subgraph, but
+ * since the user connected something to the volume output the surface
+ * should still be transparent.
+ * Therefore, has_volume_connected stores whether some volume subtree
+ * was connected before optimization. */
+ bool has_volume_connected;
+
/* information about shader after compiling */
bool has_surface;
bool has_surface_emission;
diff --git a/intern/cycles/render/tile.cpp b/intern/cycles/render/tile.cpp
index 944e746ca2d..176a1f4f0f3 100644
--- a/intern/cycles/render/tile.cpp
+++ b/intern/cycles/render/tile.cpp
@@ -25,37 +25,39 @@ namespace {
class TileComparator {
public:
- TileComparator(TileOrder order, int2 center)
- : order_(order),
- center_(center)
+ TileComparator(TileOrder order_, int2 center_, Tile *tiles_)
+ : order(order_),
+ center(center_),
+ tiles(tiles_)
{}
- bool operator()(Tile &a, Tile &b)
+ bool operator()(int a, int b)
{
- switch(order_) {
+ switch(order) {
case TILE_CENTER:
{
- float2 dist_a = make_float2(center_.x - (a.x + a.w/2),
- center_.y - (a.y + a.h/2));
- float2 dist_b = make_float2(center_.x - (b.x + b.w/2),
- center_.y - (b.y + b.h/2));
+ float2 dist_a = make_float2(center.x - (tiles[a].x + tiles[a].w/2),
+ center.y - (tiles[a].y + tiles[a].h/2));
+ float2 dist_b = make_float2(center.x - (tiles[b].x + tiles[b].w/2),
+ center.y - (tiles[b].y + tiles[b].h/2));
return dot(dist_a, dist_a) < dot(dist_b, dist_b);
}
case TILE_LEFT_TO_RIGHT:
- return (a.x == b.x)? (a.y < b.y): (a.x < b.x);
+ return (tiles[a].x == tiles[b].x)? (tiles[a].y < tiles[b].y): (tiles[a].x < tiles[b].x);
case TILE_RIGHT_TO_LEFT:
- return (a.x == b.x)? (a.y < b.y): (a.x > b.x);
+ return (tiles[a].x == tiles[b].x)? (tiles[a].y < tiles[b].y): (tiles[a].x > tiles[b].x);
case TILE_TOP_TO_BOTTOM:
- return (a.y == b.y)? (a.x < b.x): (a.y > b.y);
+ return (tiles[a].y == tiles[b].y)? (tiles[a].x < tiles[b].x): (tiles[a].y > tiles[b].y);
case TILE_BOTTOM_TO_TOP:
default:
- return (a.y == b.y)? (a.x < b.x): (a.y < b.y);
+ return (tiles[a].y == tiles[b].y)? (tiles[a].x < tiles[b].x): (tiles[a].y < tiles[b].y);
}
}
protected:
- TileOrder order_;
- int2 center_;
+ TileOrder order;
+ int2 center;
+ Tile *tiles;
};
inline int2 hilbert_index_to_pos(int n, int d)
@@ -96,6 +98,7 @@ TileManager::TileManager(bool progressive_, int num_samples_, int2 tile_size_, i
num_devices = num_devices_;
preserve_tile_device = preserve_tile_device_;
background = background_;
+ schedule_denoising = false;
range_start_sample = 0;
range_num_samples = -1;
@@ -108,6 +111,16 @@ TileManager::~TileManager()
{
}
+void TileManager::free_device()
+{
+ if(schedule_denoising) {
+ for(int i = 0; i < state.tiles.size(); i++) {
+ delete state.tiles[i].buffers;
+ state.tiles[i].buffers = NULL;
+ }
+ }
+}
+
static int get_divider(int w, int h, int start_resolution)
{
int divider = 1;
@@ -133,6 +146,8 @@ void TileManager::reset(BufferParams& params_, int num_samples_)
state.num_tiles = 0;
state.num_samples = 0;
state.resolution_divider = get_divider(params.width, params.height, start_resolution);
+ state.render_tiles.clear();
+ state.denoising_tiles.clear();
state.tiles.clear();
}
@@ -157,6 +172,9 @@ void TileManager::set_samples(int num_samples_)
}
state.total_pixel_samples = pixel_samples + (uint64_t)get_num_effective_samples() * params.width*params.height;
+ if(schedule_denoising) {
+ state.total_pixel_samples += params.width*params.height;
+ }
}
}
@@ -169,32 +187,36 @@ int TileManager::gen_tiles(bool sliced)
int image_h = max(1, params.height/resolution);
int2 center = make_int2(image_w/2, image_h/2);
- state.tiles.clear();
-
int num_logical_devices = preserve_tile_device? num_devices: 1;
int num = min(image_h, num_logical_devices);
int slice_num = sliced? num: 1;
- int tile_index = 0;
+ int tile_w = (tile_size.x >= image_w) ? 1 : divide_up(image_w, tile_size.x);
state.tiles.clear();
- state.tiles.resize(num);
- vector<list<Tile> >::iterator tile_list = state.tiles.begin();
+ state.render_tiles.clear();
+ state.denoising_tiles.clear();
+ state.render_tiles.resize(num);
+ state.denoising_tiles.resize(num);
+ state.tile_stride = tile_w;
+ vector<list<int> >::iterator tile_list;
+ tile_list = state.render_tiles.begin();
if(tile_order == TILE_HILBERT_SPIRAL) {
assert(!sliced);
+ int tile_h = (tile_size.y >= image_h) ? 1 : divide_up(image_h, tile_size.y);
+ state.tiles.resize(tile_w*tile_h);
+
/* Size of blocks in tiles, must be a power of 2 */
const int hilbert_size = (max(tile_size.x, tile_size.y) <= 12)? 8: 4;
- int tile_w = (tile_size.x >= image_w)? 1: (image_w + tile_size.x - 1)/tile_size.x;
- int tile_h = (tile_size.y >= image_h)? 1: (image_h + tile_size.y - 1)/tile_size.y;
- int tiles_per_device = (tile_w * tile_h + num - 1) / num;
+ int tiles_per_device = divide_up(tile_w * tile_h, num);
int cur_device = 0, cur_tiles = 0;
int2 block_size = tile_size * make_int2(hilbert_size, hilbert_size);
/* Number of blocks to fill the image */
- int blocks_x = (block_size.x >= image_w)? 1: (image_w + block_size.x - 1)/block_size.x;
- int blocks_y = (block_size.y >= image_h)? 1: (image_h + block_size.y - 1)/block_size.y;
+ int blocks_x = (block_size.x >= image_w)? 1: divide_up(image_w, block_size.x);
+ int blocks_y = (block_size.y >= image_h)? 1: divide_up(image_h, block_size.y);
int n = max(blocks_x, blocks_y) | 0x1; /* Side length of the spiral (must be odd) */
/* Offset of spiral (to keep it centered) */
int2 offset = make_int2((image_w - n*block_size.x)/2, (image_h - n*block_size.y)/2);
@@ -225,9 +247,11 @@ int TileManager::gen_tiles(bool sliced)
if(pos.x >= 0 && pos.y >= 0 && pos.x < image_w && pos.y < image_h) {
int w = min(tile_size.x, image_w - pos.x);
int h = min(tile_size.y, image_h - pos.y);
- tile_list->push_front(Tile(tile_index, pos.x, pos.y, w, h, cur_device));
+ int2 ipos = pos / tile_size;
+ int idx = ipos.y*tile_w + ipos.x;
+ state.tiles[idx] = Tile(idx, pos.x, pos.y, w, h, cur_device, Tile::RENDER);
+ tile_list->push_front(idx);
cur_tiles++;
- tile_index++;
if(cur_tiles == tiles_per_device) {
tile_list++;
@@ -271,27 +295,28 @@ int TileManager::gen_tiles(bool sliced)
break;
}
}
- return tile_index;
+ return tile_w*tile_h;
}
+ int idx = 0;
for(int slice = 0; slice < slice_num; slice++) {
int slice_y = (image_h/slice_num)*slice;
int slice_h = (slice == slice_num-1)? image_h - slice*(image_h/slice_num): image_h/slice_num;
- int tile_w = (tile_size.x >= image_w)? 1: (image_w + tile_size.x - 1)/tile_size.x;
- int tile_h = (tile_size.y >= slice_h)? 1: (slice_h + tile_size.y - 1)/tile_size.y;
+ int tile_h = (tile_size.y >= slice_h)? 1: divide_up(slice_h, tile_size.y);
- int tiles_per_device = (tile_w * tile_h + num - 1) / num;
+ int tiles_per_device = divide_up(tile_w * tile_h, num);
int cur_device = 0, cur_tiles = 0;
for(int tile_y = 0; tile_y < tile_h; tile_y++) {
- for(int tile_x = 0; tile_x < tile_w; tile_x++, tile_index++) {
+ for(int tile_x = 0; tile_x < tile_w; tile_x++, idx++) {
int x = tile_x * tile_size.x;
int y = tile_y * tile_size.y;
int w = (tile_x == tile_w-1)? image_w - x: tile_size.x;
int h = (tile_y == tile_h-1)? slice_h - y: tile_size.y;
- tile_list->push_back(Tile(tile_index, x, y + slice_y, w, h, sliced? slice: cur_device));
+ state.tiles.push_back(Tile(idx, x, y + slice_y, w, h, sliced? slice: cur_device, Tile::RENDER));
+ tile_list->push_back(idx);
if(!sliced) {
cur_tiles++;
@@ -299,7 +324,7 @@ int TileManager::gen_tiles(bool sliced)
if(cur_tiles == tiles_per_device) {
/* Tiles are already generated in Bottom-to-Top order, so no sort is necessary in that case. */
if(tile_order != TILE_BOTTOM_TO_TOP) {
- tile_list->sort(TileComparator(tile_order, center));
+ tile_list->sort(TileComparator(tile_order, center, &state.tiles[0]));
}
tile_list++;
cur_tiles = 0;
@@ -313,7 +338,7 @@ int TileManager::gen_tiles(bool sliced)
}
}
- return tile_index;
+ return idx;
}
void TileManager::set_tiles()
@@ -333,15 +358,111 @@ void TileManager::set_tiles()
state.buffer.full_height = max(1, params.full_height/resolution);
}
-bool TileManager::next_tile(Tile& tile, int device)
+int TileManager::get_neighbor_index(int index, int neighbor)
+{
+ static const int dx[] = {-1, 0, 1, -1, 1, -1, 0, 1, 0}, dy[] = {-1, -1, -1, 0, 0, 1, 1, 1, 0};
+
+ int resolution = state.resolution_divider;
+ int image_w = max(1, params.width/resolution);
+ int image_h = max(1, params.height/resolution);
+ int tile_w = (tile_size.x >= image_w)? 1: divide_up(image_w, tile_size.x);
+ int tile_h = (tile_size.y >= image_h)? 1: divide_up(image_h, tile_size.y);
+
+ int nx = state.tiles[index].x/tile_size.x + dx[neighbor], ny = state.tiles[index].y/tile_size.y + dy[neighbor];
+ if(nx < 0 || ny < 0 || nx >= tile_w || ny >= tile_h)
+ return -1;
+
+ return ny*state.tile_stride + nx;
+}
+
+/* Checks whether all neighbors of a tile (as well as the tile itself) are at least at state min_state. */
+bool TileManager::check_neighbor_state(int index, Tile::State min_state)
+{
+ if(index < 0 || state.tiles[index].state < min_state) {
+ return false;
+ }
+ for(int neighbor = 0; neighbor < 9; neighbor++) {
+ int nindex = get_neighbor_index(index, neighbor);
+ /* Out-of-bounds tiles don't matter. */
+ if(nindex >= 0 && state.tiles[nindex].state < min_state) {
+ return false;
+ }
+ }
+
+ return true;
+}
+
+/* Returns whether the tile should be written (and freed if no denoising is used) instead of updating. */
+bool TileManager::finish_tile(int index, bool &delete_tile)
+{
+ delete_tile = false;
+
+ switch(state.tiles[index].state) {
+ case Tile::RENDER:
+ {
+ if(!schedule_denoising) {
+ state.tiles[index].state = Tile::DONE;
+ delete_tile = true;
+ return true;
+ }
+ state.tiles[index].state = Tile::RENDERED;
+ /* For each neighbor and the tile itself, check whether all of its neighbors have been rendered. If yes, it can be denoised. */
+ for(int neighbor = 0; neighbor < 9; neighbor++) {
+ int nindex = get_neighbor_index(index, neighbor);
+ if(check_neighbor_state(nindex, Tile::RENDERED)) {
+ state.tiles[nindex].state = Tile::DENOISE;
+ state.denoising_tiles[state.tiles[nindex].device].push_back(nindex);
+ }
+ }
+ return false;
+ }
+ case Tile::DENOISE:
+ {
+ state.tiles[index].state = Tile::DENOISED;
+ /* For each neighbor and the tile itself, check whether all of its neighbors have been denoised. If yes, it can be freed. */
+ for(int neighbor = 0; neighbor < 9; neighbor++) {
+ int nindex = get_neighbor_index(index, neighbor);
+ if(check_neighbor_state(nindex, Tile::DENOISED)) {
+ state.tiles[nindex].state = Tile::DONE;
+ /* It can happen that the tile just finished denoising and already can be freed here.
+ * However, in that case it still has to be written before deleting, so we can't delete it yet. */
+ if(neighbor == 8) {
+ delete_tile = true;
+ }
+ else {
+ delete state.tiles[nindex].buffers;
+ state.tiles[nindex].buffers = NULL;
+ }
+ }
+ }
+ return true;
+ }
+ default:
+ assert(false);
+ return true;
+ }
+}
+
+bool TileManager::next_tile(Tile* &tile, int device)
{
int logical_device = preserve_tile_device? device: 0;
- if((logical_device >= state.tiles.size()) || state.tiles[logical_device].empty())
+ if(logical_device >= state.render_tiles.size())
+ return false;
+
+ if(!state.denoising_tiles[logical_device].empty()) {
+ int idx = state.denoising_tiles[logical_device].front();
+ state.denoising_tiles[logical_device].pop_front();
+ tile = &state.tiles[idx];
+ return true;
+ }
+
+ if(state.render_tiles[logical_device].empty())
return false;
- tile = Tile(state.tiles[logical_device].front());
- state.tiles[logical_device].pop_front();
+ int idx = state.render_tiles[logical_device].front();
+ state.render_tiles[logical_device].pop_front();
+ tile = &state.tiles[idx];
return true;
}
diff --git a/intern/cycles/render/tile.h b/intern/cycles/render/tile.h
index 622b89f7670..e39a8f0627a 100644
--- a/intern/cycles/render/tile.h
+++ b/intern/cycles/render/tile.h
@@ -31,12 +31,20 @@ public:
int index;
int x, y, w, h;
int device;
+ /* RENDER: The tile has to be rendered.
+ * RENDERED: The tile has been rendered, but can't be denoised yet (waiting for neighbors).
+ * DENOISE: The tile can be denoised now.
+ * DENOISED: The tile has been denoised, but can't be freed yet (waiting for neighbors).
+ * DONE: The tile is finished and has been freed. */
+ typedef enum { RENDER = 0, RENDERED, DENOISE, DENOISED, DONE } State;
+ State state;
+ RenderBuffers *buffers;
Tile()
{}
- Tile(int index_, int x_, int y_, int w_, int h_, int device_)
- : index(index_), x(x_), y(y_), w(w_), h(h_), device(device_) {}
+ Tile(int index_, int x_, int y_, int w_, int h_, int device_, State state_ = RENDER)
+ : index(index_), x(x_), y(y_), w(w_), h(h_), device(device_), state(state_), buffers(NULL) {}
};
/* Tile order */
@@ -58,6 +66,8 @@ public:
BufferParams params;
struct State {
+ vector<Tile> tiles;
+ int tile_stride;
BufferParams buffer;
int sample;
int num_samples;
@@ -67,9 +77,12 @@ public:
/* Total samples over all pixels: Generally num_samples*num_pixels,
* but can be higher due to the initial resolution division for previews. */
uint64_t total_pixel_samples;
- /* This vector contains a list of tiles for every logical device in the session.
- * In each list, the tiles are sorted according to the tile order setting. */
- vector<list<Tile> > tiles;
+
+ /* These lists contain the indices of the tiles to be rendered/denoised and are used
+ * when acquiring a new tile for the device.
+ * Each list in each vector is for one logical device. */
+ vector<list<int> > render_tiles;
+ vector<list<int> > denoising_tiles;
} state;
int num_samples;
@@ -78,10 +91,12 @@ public:
bool preserve_tile_device, bool background, TileOrder tile_order, int num_devices = 1);
~TileManager();
+ void free_device();
void reset(BufferParams& params, int num_samples);
void set_samples(int num_samples);
bool next();
- bool next_tile(Tile& tile, int device = 0);
+ bool next_tile(Tile* &tile, int device = 0);
+ bool finish_tile(int index, bool& delete_tile);
bool done();
void set_tile_order(TileOrder tile_order_) { tile_order = tile_order_; }
@@ -96,6 +111,9 @@ public:
/* Get number of actual samples to render. */
int get_num_effective_samples();
+
+ /* Schedule tiles for denoising after they've been rendered. */
+ bool schedule_denoising;
protected:
void set_tiles();
@@ -127,6 +145,9 @@ protected:
/* Generate tile list, return number of tiles. */
int gen_tiles(bool sliced);
+
+ int get_neighbor_index(int index, int neighbor);
+ bool check_neighbor_state(int index, Tile::State state);
};
CCL_NAMESPACE_END
diff --git a/intern/cycles/test/util_string_test.cpp b/intern/cycles/test/util_string_test.cpp
index 22ec8e0ee8e..6c059ba5d12 100644
--- a/intern/cycles/test/util_string_test.cpp
+++ b/intern/cycles/test/util_string_test.cpp
@@ -245,4 +245,41 @@ TEST(util_string_remove_trademark, both)
EXPECT_EQ(str, "foo bar zzz");
}
+TEST(util_string_remove_trademark, both_space)
+{
+ string str = string_remove_trademark("foo bar(TM) (R) zzz");
+ EXPECT_EQ(str, "foo bar zzz");
+}
+
+TEST(util_string_remove_trademark, both_space_around)
+{
+ string str = string_remove_trademark("foo bar (TM) (R) zzz");
+ EXPECT_EQ(str, "foo bar zzz");
+}
+
+TEST(util_string_remove_trademark, trademark_space_suffix)
+{
+ string str = string_remove_trademark("foo bar (TM)");
+ EXPECT_EQ(str, "foo bar");
+}
+
+TEST(util_string_remove_trademark, trademark_space_middle)
+{
+ string str = string_remove_trademark("foo bar (TM) baz");
+ EXPECT_EQ(str, "foo bar baz");
+}
+
+
+TEST(util_string_remove_trademark, r_space_suffix)
+{
+ string str = string_remove_trademark("foo bar (R)");
+ EXPECT_EQ(str, "foo bar");
+}
+
+TEST(util_string_remove_trademark, r_space_middle)
+{
+ string str = string_remove_trademark("foo bar (R) baz");
+ EXPECT_EQ(str, "foo bar baz");
+}
+
CCL_NAMESPACE_END
diff --git a/intern/cycles/util/CMakeLists.txt b/intern/cycles/util/CMakeLists.txt
index a015fef8284..43f9a57d099 100644
--- a/intern/cycles/util/CMakeLists.txt
+++ b/intern/cycles/util/CMakeLists.txt
@@ -53,6 +53,13 @@ set(SRC_HEADERS
util_math_cdf.h
util_math_fast.h
util_math_intersect.h
+ util_math_float2.h
+ util_math_float3.h
+ util_math_float4.h
+ util_math_int2.h
+ util_math_int3.h
+ util_math_int4.h
+ util_math_matrix.h
util_md5.h
util_opengl.h
util_optimization.h
@@ -80,6 +87,32 @@ set(SRC_HEADERS
util_time.h
util_transform.h
util_types.h
+ util_types_float2.h
+ util_types_float2_impl.h
+ util_types_float3.h
+ util_types_float3_impl.h
+ util_types_float4.h
+ util_types_float4_impl.h
+ util_types_int2.h
+ util_types_int2_impl.h
+ util_types_int3.h
+ util_types_int3_impl.h
+ util_types_int4.h
+ util_types_int4_impl.h
+ util_types_uchar2.h
+ util_types_uchar2_impl.h
+ util_types_uchar3.h
+ util_types_uchar3_impl.h
+ util_types_uchar4.h
+ util_types_uchar4_impl.h
+ util_types_uint2.h
+ util_types_uint2_impl.h
+ util_types_uint3.h
+ util_types_uint3_impl.h
+ util_types_uint4.h
+ util_types_uint4_impl.h
+ util_types_vector3.h
+ util_types_vector3_impl.h
util_vector.h
util_version.h
util_view.h
diff --git a/intern/cycles/util/util_atomic.h b/intern/cycles/util/util_atomic.h
index 6c52117ef9a..643af87a65f 100644
--- a/intern/cycles/util/util_atomic.h
+++ b/intern/cycles/util/util_atomic.h
@@ -35,6 +35,7 @@ ATOMIC_INLINE void atomic_update_max_z(size_t *maximum_value, size_t value)
#define atomic_add_and_fetch_float(p, x) atomic_add_and_fetch_fl((p), (x))
#define atomic_fetch_and_inc_uint32(p) atomic_fetch_and_add_uint32((p), 1)
+#define atomic_fetch_and_dec_uint32(p) atomic_fetch_and_add_uint32((p), -1)
#define CCL_LOCAL_MEM_FENCE 0
#define ccl_barrier(flags) (void)0
@@ -68,6 +69,7 @@ ccl_device_inline float atomic_add_and_fetch_float(volatile ccl_global float *so
#define atomic_fetch_and_add_uint32(p, x) atomic_add((p), (x))
#define atomic_fetch_and_inc_uint32(p) atomic_inc((p))
+#define atomic_fetch_and_dec_uint32(p) atomic_dec((p))
#define CCL_LOCAL_MEM_FENCE CLK_LOCAL_MEM_FENCE
#define ccl_barrier(flags) barrier(flags)
@@ -79,7 +81,9 @@ ccl_device_inline float atomic_add_and_fetch_float(volatile ccl_global float *so
#define atomic_add_and_fetch_float(p, x) (atomicAdd((float*)(p), (float)(x)) + (float)(x))
#define atomic_fetch_and_add_uint32(p, x) atomicAdd((unsigned int*)(p), (unsigned int)(x))
+#define atomic_fetch_and_sub_uint32(p, x) atomicSub((unsigned int*)(p), (unsigned int)(x))
#define atomic_fetch_and_inc_uint32(p) atomic_fetch_and_add_uint32((p), 1)
+#define atomic_fetch_and_dec_uint32(p) atomic_fetch_and_sub_uint32((p), 1)
#define CCL_LOCAL_MEM_FENCE
#define ccl_barrier(flags) __syncthreads()
diff --git a/intern/cycles/util/util_color.h b/intern/cycles/util/util_color.h
index 4d673dc34d8..c73beab98dc 100644
--- a/intern/cycles/util/util_color.h
+++ b/intern/cycles/util/util_color.h
@@ -157,16 +157,6 @@ ccl_device float3 xyz_to_rgb(float x, float y, float z)
0.055648f * x + -0.204043f * y + 1.057311f * z);
}
-#ifndef __KERNEL_OPENCL__
-
-ccl_device float3 color_srgb_to_scene_linear(float3 c)
-{
- return make_float3(
- color_srgb_to_scene_linear(c.x),
- color_srgb_to_scene_linear(c.y),
- color_srgb_to_scene_linear(c.z));
-}
-
#ifdef __KERNEL_SSE2__
/*
* Calculate initial guess for arg^exp based on float representation
@@ -222,17 +212,38 @@ ccl_device ssef color_srgb_to_scene_linear(const ssef &c)
ssef gte = fastpow24(gtebase);
return select(cmp, lt, gte);
}
-#endif
+#endif /* __KERNEL_SSE2__ */
-ccl_device float3 color_scene_linear_to_srgb(float3 c)
+ccl_device float3 color_srgb_to_scene_linear_v3(float3 c)
{
- return make_float3(
- color_scene_linear_to_srgb(c.x),
- color_scene_linear_to_srgb(c.y),
- color_scene_linear_to_srgb(c.z));
+ return make_float3(color_srgb_to_scene_linear(c.x),
+ color_srgb_to_scene_linear(c.y),
+ color_srgb_to_scene_linear(c.z));
}
+ccl_device float3 color_scene_linear_to_srgb_v3(float3 c)
+{
+ return make_float3(color_scene_linear_to_srgb(c.x),
+ color_scene_linear_to_srgb(c.y),
+ color_scene_linear_to_srgb(c.z));
+}
+
+ccl_device float4 color_srgb_to_scene_linear_v4(float4 c)
+{
+#ifdef __KERNEL_SSE2__
+ ssef r_ssef;
+ float4 &r = (float4 &)r_ssef;
+ r = c;
+ r_ssef = color_srgb_to_scene_linear(r_ssef);
+ r.w = c.w;
+ return r;
+#else
+ return make_float4(color_srgb_to_scene_linear(c.x),
+ color_srgb_to_scene_linear(c.y),
+ color_srgb_to_scene_linear(c.z),
+ c.w);
#endif
+}
ccl_device float linear_rgb_to_gray(float3 c)
{
diff --git a/intern/cycles/util/util_debug.cpp b/intern/cycles/util/util_debug.cpp
index 9cfa57dd741..10895f2e918 100644
--- a/intern/cycles/util/util_debug.cpp
+++ b/intern/cycles/util/util_debug.cpp
@@ -118,7 +118,7 @@ void DebugFlags::OpenCL::reset()
}
/* Initialize other flags from environment variables. */
debug = (getenv("CYCLES_OPENCL_DEBUG") != NULL);
- single_program = (getenv("CYCLES_OPENCL_SINGLE_PROGRAM") != NULL);
+ single_program = (getenv("CYCLES_OPENCL_MULTI_PROGRAM") == NULL);
}
DebugFlags::DebugFlags()
@@ -184,8 +184,8 @@ std::ostream& operator <<(std::ostream &os,
<< " Device type : " << opencl_device_type << "\n"
<< " Kernel type : " << opencl_kernel_type << "\n"
<< " Debug : " << string_from_bool(debug_flags.opencl.debug) << "\n"
- << " Signle program : " << string_from_bool(debug_flags.opencl.single_program)
- << "\n";
+ << " Single program : " << string_from_bool(debug_flags.opencl.single_program) << "\n"
+ << " Memory limit : " << string_human_readable_size(debug_flags.opencl.mem_limit) << "\n";
return os;
}
diff --git a/intern/cycles/util/util_debug.h b/intern/cycles/util/util_debug.h
index 4505d584490..450cd900a9f 100644
--- a/intern/cycles/util/util_debug.h
+++ b/intern/cycles/util/util_debug.h
@@ -115,6 +115,10 @@ public:
/* Use single program */
bool single_program;
+
+ /* TODO(mai): Currently this is only for OpenCL, but we should have it implemented for all devices. */
+ /* Artificial memory limit in bytes (0 if disabled). */
+ size_t mem_limit;
};
/* Get instance of debug flags registry. */
diff --git a/intern/cycles/util/util_guarded_allocator.h b/intern/cycles/util/util_guarded_allocator.h
index 5f9dcfb2481..1abcabd5294 100644
--- a/intern/cycles/util/util_guarded_allocator.h
+++ b/intern/cycles/util/util_guarded_allocator.h
@@ -50,9 +50,9 @@ public:
T *allocate(size_t n, const void *hint = 0)
{
+ (void)hint;
size_t size = n * sizeof(T);
util_guarded_mem_alloc(size);
- (void)hint;
if(n == 0) {
return NULL;
}
diff --git a/intern/cycles/util/util_logging.cpp b/intern/cycles/util/util_logging.cpp
index a5a3bd34fff..f38683bf7de 100644
--- a/intern/cycles/util/util_logging.cpp
+++ b/intern/cycles/util/util_logging.cpp
@@ -30,10 +30,10 @@ void util_logging_init(const char *argv0)
#ifdef WITH_CYCLES_LOGGING
using CYCLES_GFLAGS_NAMESPACE::SetCommandLineOption;
- /* Make it so FATAL messages are always print into console. */
+ /* Make it so ERROR messages are always print into console. */
char severity_fatal[32];
snprintf(severity_fatal, sizeof(severity_fatal), "%d",
- google::GLOG_FATAL);
+ google::GLOG_ERROR);
google::InitGoogleLogging(argv0);
SetCommandLineOption("logtostderr", "1");
diff --git a/intern/cycles/util/util_logging.h b/intern/cycles/util/util_logging.h
index ecf9c9cfee0..492f830e67c 100644
--- a/intern/cycles/util/util_logging.h
+++ b/intern/cycles/util/util_logging.h
@@ -19,28 +19,30 @@
#if defined(WITH_CYCLES_LOGGING) && !defined(__KERNEL_GPU__)
# include <glog/logging.h>
-#else
-# include <iostream>
#endif
+#include <iostream>
+
CCL_NAMESPACE_BEGIN
#if !defined(WITH_CYCLES_LOGGING) || defined(__KERNEL_GPU__)
-class StubStream : public std::ostream {
- public:
- StubStream() : std::ostream(NULL) { }
+class StubStream {
+public:
+ template<class T>
+ StubStream& operator<<(const T&) {
+ return *this;
+ }
};
class LogMessageVoidify {
public:
LogMessageVoidify() { }
- void operator&(::std::ostream&) { }
+ void operator&(StubStream&) { }
};
# define LOG_SUPPRESS() (true) ? (void) 0 : LogMessageVoidify() & StubStream()
# define LOG(severity) LOG_SUPPRESS()
# define VLOG(severity) LOG_SUPPRESS()
-
#endif
#define VLOG_ONCE(level, flag) if(!flag) flag = true, VLOG(level)
diff --git a/intern/cycles/util/util_math.h b/intern/cycles/util/util_math.h
index e0305b978b9..b719640b19c 100644
--- a/intern/cycles/util/util_math.h
+++ b/intern/cycles/util/util_math.h
@@ -28,12 +28,10 @@
#ifndef __KERNEL_OPENCL__
-
-#include <float.h>
-#include <math.h>
-#include <stdio.h>
-
-#endif
+# include <float.h>
+# include <math.h>
+# include <stdio.h>
+#endif /* __KERNEL_OPENCL__ */
#include "util/util_types.h"
@@ -43,49 +41,44 @@ CCL_NAMESPACE_BEGIN
/* Division */
#ifndef M_PI_F
-#define M_PI_F (3.1415926535897932f) /* pi */
+# define M_PI_F (3.1415926535897932f) /* pi */
#endif
#ifndef M_PI_2_F
-#define M_PI_2_F (1.5707963267948966f) /* pi/2 */
+# define M_PI_2_F (1.5707963267948966f) /* pi/2 */
#endif
#ifndef M_PI_4_F
-#define M_PI_4_F (0.7853981633974830f) /* pi/4 */
+# define M_PI_4_F (0.7853981633974830f) /* pi/4 */
#endif
#ifndef M_1_PI_F
-#define M_1_PI_F (0.3183098861837067f) /* 1/pi */
+# define M_1_PI_F (0.3183098861837067f) /* 1/pi */
#endif
#ifndef M_2_PI_F
-#define M_2_PI_F (0.6366197723675813f) /* 2/pi */
+# define M_2_PI_F (0.6366197723675813f) /* 2/pi */
#endif
/* Multiplication */
#ifndef M_2PI_F
-#define M_2PI_F (6.2831853071795864f) /* 2*pi */
+# define M_2PI_F (6.2831853071795864f) /* 2*pi */
#endif
#ifndef M_4PI_F
-#define M_4PI_F (12.566370614359172f) /* 4*pi */
+# define M_4PI_F (12.566370614359172f) /* 4*pi */
#endif
/* Float sqrt variations */
-
#ifndef M_SQRT2_F
-#define M_SQRT2_F (1.4142135623730950f) /* sqrt(2) */
+# define M_SQRT2_F (1.4142135623730950f) /* sqrt(2) */
#endif
-
#ifndef M_LN2_F
-#define M_LN2_F (0.6931471805599453f) /* ln(2) */
+# define M_LN2_F (0.6931471805599453f) /* ln(2) */
#endif
-
#ifndef M_LN10_F
-#define M_LN10_F (2.3025850929940457f) /* ln(10) */
+# define M_LN10_F (2.3025850929940457f) /* ln(10) */
#endif
/* Scalar */
#ifdef _WIN32
-
-#ifndef __KERNEL_OPENCL__
-
+# ifndef __KERNEL_OPENCL__
ccl_device_inline float fmaxf(float a, float b)
{
return (a > b)? a: b;
@@ -95,13 +88,10 @@ ccl_device_inline float fminf(float a, float b)
{
return (a < b)? a: b;
}
-
-#endif
-
-#endif
+# endif /* !__KERNEL_OPENCL__ */
+#endif /* _WIN32 */
#ifndef __KERNEL_GPU__
-
using std::isfinite;
using std::isnan;
@@ -157,8 +147,7 @@ ccl_device_inline T max4(const T& a, const T& b, const T& c, const T& d)
{
return max(max(a,b),max(c,d));
}
-
-#endif
+#endif /* __KERNEL_GPU__ */
ccl_device_inline float min4(float a, float b, float c, float d)
{
@@ -170,525 +159,141 @@ ccl_device_inline float max4(float a, float b, float c, float d)
return max(max(a, b), max(c, d));
}
-ccl_device_inline float max3(float3 a)
-{
- return max(max(a.x, a.y), a.z);
-}
-
#ifndef __KERNEL_OPENCL__
+/* Int/Float conversion */
-ccl_device_inline int clamp(int a, int mn, int mx)
-{
- return min(max(a, mn), mx);
-}
-
-ccl_device_inline float clamp(float a, float mn, float mx)
-{
- return min(max(a, mn), mx);
-}
-
-ccl_device_inline float mix(float a, float b, float t)
-{
- return a + t*(b - a);
-}
-
-#endif
-
-#ifndef __KERNEL_CUDA__
-
-ccl_device_inline float saturate(float a)
-{
- return clamp(a, 0.0f, 1.0f);
-}
-
-#endif
-
-ccl_device_inline int float_to_int(float f)
-{
- return (int)f;
-}
-
-ccl_device_inline int floor_to_int(float f)
-{
- return float_to_int(floorf(f));
-}
-
-ccl_device_inline int ceil_to_int(float f)
-{
- return float_to_int(ceilf(f));
-}
-
-ccl_device_inline float signf(float f)
-{
- return (f < 0.0f)? -1.0f: 1.0f;
-}
-
-ccl_device_inline float nonzerof(float f, float eps)
-{
- if(fabsf(f) < eps)
- return signf(f)*eps;
- else
- return f;
-}
-
-ccl_device_inline float smoothstepf(float f)
-{
- float ff = f*f;
- return (3.0f*ff - 2.0f*ff*f);
-}
-
-ccl_device_inline int mod(int x, int m)
-{
- return (x % m + m) % m;
-}
-
-/* Float2 Vector */
-
-#ifndef __KERNEL_OPENCL__
-
-ccl_device_inline bool is_zero(const float2& a)
-{
- return (a.x == 0.0f && a.y == 0.0f);
-}
-
-#endif
-
-#ifndef __KERNEL_OPENCL__
-
-ccl_device_inline float average(const float2& a)
-{
- return (a.x + a.y)*(1.0f/2.0f);
-}
-
-#endif
-
-#ifndef __KERNEL_OPENCL__
-
-ccl_device_inline float2 operator-(const float2& a)
-{
- return make_float2(-a.x, -a.y);
-}
-
-ccl_device_inline float2 operator*(const float2& a, const float2& b)
-{
- return make_float2(a.x*b.x, a.y*b.y);
-}
-
-ccl_device_inline float2 operator*(const float2& a, float f)
-{
- return make_float2(a.x*f, a.y*f);
-}
-
-ccl_device_inline float2 operator*(float f, const float2& a)
-{
- return make_float2(a.x*f, a.y*f);
-}
-
-ccl_device_inline float2 operator/(float f, const float2& a)
-{
- return make_float2(f/a.x, f/a.y);
-}
-
-ccl_device_inline float2 operator/(const float2& a, float f)
-{
- float invf = 1.0f/f;
- return make_float2(a.x*invf, a.y*invf);
-}
-
-ccl_device_inline float2 operator/(const float2& a, const float2& b)
+ccl_device_inline int as_int(uint i)
{
- return make_float2(a.x/b.x, a.y/b.y);
+ union { uint ui; int i; } u;
+ u.ui = i;
+ return u.i;
}
-ccl_device_inline float2 operator+(const float2& a, const float2& b)
+ccl_device_inline uint as_uint(int i)
{
- return make_float2(a.x+b.x, a.y+b.y);
+ union { uint ui; int i; } u;
+ u.i = i;
+ return u.ui;
}
-ccl_device_inline float2 operator-(const float2& a, const float2& b)
+ccl_device_inline uint as_uint(float f)
{
- return make_float2(a.x-b.x, a.y-b.y);
+ union { uint i; float f; } u;
+ u.f = f;
+ return u.i;
}
-ccl_device_inline float2 operator+=(float2& a, const float2& b)
+ccl_device_inline int __float_as_int(float f)
{
- return a = a + b;
+ union { int i; float f; } u;
+ u.f = f;
+ return u.i;
}
-ccl_device_inline float2 operator*=(float2& a, const float2& b)
+ccl_device_inline float __int_as_float(int i)
{
- return a = a * b;
+ union { int i; float f; } u;
+ u.i = i;
+ return u.f;
}
-ccl_device_inline float2 operator*=(float2& a, float f)
+ccl_device_inline uint __float_as_uint(float f)
{
- return a = a * f;
+ union { uint i; float f; } u;
+ u.f = f;
+ return u.i;
}
-ccl_device_inline float2 operator/=(float2& a, const float2& b)
+ccl_device_inline float __uint_as_float(uint i)
{
- return a = a / b;
+ union { uint i; float f; } u;
+ u.i = i;
+ return u.f;
}
+#endif /* __KERNEL_OPENCL__ */
-ccl_device_inline float2 operator/=(float2& a, float f)
+/* Versions of functions which are safe for fast math. */
+ccl_device_inline bool isnan_safe(float f)
{
- float invf = 1.0f/f;
- return a = a * invf;
+ unsigned int x = __float_as_uint(f);
+ return (x << 1) > 0xff000000u;
}
-
-ccl_device_inline float dot(const float2& a, const float2& b)
+ccl_device_inline bool isfinite_safe(float f)
{
- return a.x*b.x + a.y*b.y;
+ /* By IEEE 754 rule, 2*Inf equals Inf */
+ unsigned int x = __float_as_uint(f);
+ return (f == f) && (x == 0 || (f != 2.0f*f)) && !((x << 1) > 0xff000000u);
}
-ccl_device_inline float cross(const float2& a, const float2& b)
+ccl_device_inline float ensure_finite(float v)
{
- return (a.x*b.y - a.y*b.x);
+ return isfinite_safe(v)? v : 0.0f;
}
-#endif
-
#ifndef __KERNEL_OPENCL__
-
-ccl_device_inline bool operator==(const int2 a, const int2 b)
-{
- return (a.x == b.x && a.y == b.y);
-}
-
-ccl_device_inline float len(const float2& a)
-{
- return sqrtf(dot(a, a));
-}
-
-ccl_device_inline float2 normalize(const float2& a)
-{
- return a/len(a);
-}
-
-ccl_device_inline float2 normalize_len(const float2& a, float *t)
-{
- *t = len(a);
- return a/(*t);
-}
-
-ccl_device_inline float2 safe_normalize(const float2& a)
-{
- float t = len(a);
- return (t != 0.0f)? a/t: a;
-}
-
-ccl_device_inline bool operator==(const float2& a, const float2& b)
-{
- return (a.x == b.x && a.y == b.y);
-}
-
-ccl_device_inline bool operator!=(const float2& a, const float2& b)
-{
- return !(a == b);
-}
-
-ccl_device_inline float2 min(const float2& a, const float2& b)
-{
- return make_float2(min(a.x, b.x), min(a.y, b.y));
-}
-
-ccl_device_inline float2 max(const float2& a, const float2& b)
-{
- return make_float2(max(a.x, b.x), max(a.y, b.y));
-}
-
-ccl_device_inline float2 clamp(const float2& a, const float2& mn, const float2& mx)
+ccl_device_inline int clamp(int a, int mn, int mx)
{
return min(max(a, mn), mx);
}
-ccl_device_inline float2 fabs(const float2& a)
-{
- return make_float2(fabsf(a.x), fabsf(a.y));
-}
-
-ccl_device_inline float2 as_float2(const float4& a)
-{
- return make_float2(a.x, a.y);
-}
-
-#endif
-
-#ifndef __KERNEL_GPU__
-
-ccl_device_inline void print_float2(const char *label, const float2& a)
-{
- printf("%s: %.8f %.8f\n", label, (double)a.x, (double)a.y);
-}
-
-#endif
-
-#ifndef __KERNEL_OPENCL__
-
-ccl_device_inline float2 interp(const float2& a, const float2& b, float t)
-{
- return a + t*(b - a);
-}
-
-#endif
-
-/* Float3 Vector */
-
-#ifndef __KERNEL_OPENCL__
-
-ccl_device_inline float3 operator-(const float3& a)
-{
-#ifdef __KERNEL_SSE__
- return float3(_mm_xor_ps(a.m128, _mm_castsi128_ps(_mm_set1_epi32(0x80000000))));
-#else
- return make_float3(-a.x, -a.y, -a.z);
-#endif
-}
-
-ccl_device_inline float3 operator*(const float3& a, const float3& b)
-{
-#ifdef __KERNEL_SSE__
- return float3(_mm_mul_ps(a.m128,b.m128));
-#else
- return make_float3(a.x*b.x, a.y*b.y, a.z*b.z);
-#endif
-}
-
-ccl_device_inline float3 operator*(const float3& a, const float f)
-{
-#ifdef __KERNEL_SSE__
- return float3(_mm_mul_ps(a.m128,_mm_set1_ps(f)));
-#else
- return make_float3(a.x*f, a.y*f, a.z*f);
-#endif
-}
-
-ccl_device_inline float3 operator*(const float f, const float3& a)
-{
- /* TODO(sergey): Currently disabled, gives speedup but causes precision issues. */
-#if defined(__KERNEL_SSE__) && 0
- return float3(_mm_mul_ps(_mm_set1_ps(f), a.m128));
-#else
- return make_float3(a.x*f, a.y*f, a.z*f);
-#endif
-}
-
-ccl_device_inline float3 operator/(const float f, const float3& a)
-{
- /* TODO(sergey): Currently disabled, gives speedup but causes precision issues. */
-#if defined(__KERNEL_SSE__) && 0
- __m128 rc = _mm_rcp_ps(a.m128);
- return float3(_mm_mul_ps(_mm_set1_ps(f),rc));
-#else
- return make_float3(f / a.x, f / a.y, f / a.z);
-#endif
-}
-
-ccl_device_inline float3 operator/(const float3& a, const float f)
-{
- float invf = 1.0f/f;
- return a * invf;
-}
-
-ccl_device_inline float3 operator/(const float3& a, const float3& b)
-{
- /* TODO(sergey): Currently disabled, gives speedup but causes precision issues. */
-#if defined(__KERNEL_SSE__) && 0
- __m128 rc = _mm_rcp_ps(b.m128);
- return float3(_mm_mul_ps(a, rc));
-#else
- return make_float3(a.x / b.x, a.y / b.y, a.z / b.z);
-#endif
-}
-
-ccl_device_inline float3 operator+(const float3& a, const float3& b)
-{
-#ifdef __KERNEL_SSE__
- return float3(_mm_add_ps(a.m128, b.m128));
-#else
- return make_float3(a.x + b.x, a.y + b.y, a.z + b.z);
-#endif
-}
-
-ccl_device_inline float3 operator-(const float3& a, const float3& b)
-{
-#ifdef __KERNEL_SSE__
- return float3(_mm_sub_ps(a.m128, b.m128));
-#else
- return make_float3(a.x - b.x, a.y - b.y, a.z - b.z);
-#endif
-}
-
-ccl_device_inline float3 operator+=(float3& a, const float3& b)
-{
- return a = a + b;
-}
-
-ccl_device_inline float3 operator*=(float3& a, const float3& b)
-{
- return a = a * b;
-}
-
-ccl_device_inline float3 operator*=(float3& a, float f)
-{
- return a = a * f;
-}
-
-ccl_device_inline float3 operator/=(float3& a, const float3& b)
-{
- return a = a / b;
-}
-
-ccl_device_inline float3 operator/=(float3& a, float f)
-{
- float invf = 1.0f/f;
- return a = a * invf;
-}
-
-ccl_device_inline float dot(const float3& a, const float3& b)
-{
-#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
- return _mm_cvtss_f32(_mm_dp_ps(a, b, 0x7F));
-#else
- return a.x*b.x + a.y*b.y + a.z*b.z;
-#endif
-}
-
-ccl_device_inline float dot_xy(const float3& a, const float3& b)
-{
-#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
- return _mm_cvtss_f32(_mm_hadd_ps(_mm_mul_ps(a,b),b));
-#else
- return a.x*b.x + a.y*b.y;
-#endif
-}
-
-ccl_device_inline float dot(const float4& a, const float4& b)
-{
-#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
- return _mm_cvtss_f32(_mm_dp_ps(a, b, 0xFF));
-#else
- return (a.x*b.x + a.y*b.y) + (a.z*b.z + a.w*b.w);
-#endif
-}
-
-ccl_device_inline float3 cross(const float3& a, const float3& b)
-{
- float3 r = make_float3(a.y*b.z - a.z*b.y, a.z*b.x - a.x*b.z, a.x*b.y - a.y*b.x);
- return r;
-}
-
-#endif
-
-ccl_device_inline float len(const float3 a)
-{
-#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
- return _mm_cvtss_f32(_mm_sqrt_ss(_mm_dp_ps(a.m128, a.m128, 0x7F)));
-#else
- return sqrtf(dot(a, a));
-#endif
-}
-
-ccl_device_inline float len_squared(const float3 a)
-{
- return dot(a, a);
-}
-
-#ifndef __KERNEL_OPENCL__
-
-ccl_device_inline float len_squared(const float4& a)
-{
- return dot(a, a);
-}
-
-ccl_device_inline float3 normalize(const float3& a)
-{
-#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
- __m128 norm = _mm_sqrt_ps(_mm_dp_ps(a.m128, a.m128, 0x7F));
- return _mm_div_ps(a.m128, norm);
-#else
- return a/len(a);
-#endif
-}
-
-#endif
-
-ccl_device_inline float3 saturate3(float3 a)
+ccl_device_inline float clamp(float a, float mn, float mx)
{
- return make_float3(saturate(a.x), saturate(a.y), saturate(a.z));
+ return min(max(a, mn), mx);
}
-ccl_device_inline float3 normalize_len(const float3 a, float *t)
+ccl_device_inline float mix(float a, float b, float t)
{
- *t = len(a);
- float x = 1.0f / *t;
- return a*x;
+ return a + t*(b - a);
}
+#endif /* __KERNEL_OPENCL__ */
-ccl_device_inline float3 safe_normalize(const float3 a)
+#ifndef __KERNEL_CUDA__
+ccl_device_inline float saturate(float a)
{
- float t = len(a);
- return (t != 0.0f)? a * (1.0f/t) : a;
+ return clamp(a, 0.0f, 1.0f);
}
+#endif /* __KERNEL_CUDA__ */
-ccl_device_inline float3 safe_normalize_len(const float3 a, float *t)
+ccl_device_inline int float_to_int(float f)
{
- *t = len(a);
- return (*t != 0.0f)? a/(*t): a;
+ return (int)f;
}
-#ifndef __KERNEL_OPENCL__
-
-ccl_device_inline bool operator==(const float3& a, const float3& b)
+ccl_device_inline int floor_to_int(float f)
{
-#ifdef __KERNEL_SSE__
- return (_mm_movemask_ps(_mm_cmpeq_ps(a.m128, b.m128)) & 7) == 7;
-#else
- return (a.x == b.x && a.y == b.y && a.z == b.z);
-#endif
+ return float_to_int(floorf(f));
}
-ccl_device_inline bool operator!=(const float3& a, const float3& b)
+ccl_device_inline int ceil_to_int(float f)
{
- return !(a == b);
+ return float_to_int(ceilf(f));
}
-ccl_device_inline float3 min(const float3& a, const float3& b)
+ccl_device_inline float signf(float f)
{
-#ifdef __KERNEL_SSE__
- return _mm_min_ps(a.m128, b.m128);
-#else
- return make_float3(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z));
-#endif
+ return (f < 0.0f)? -1.0f: 1.0f;
}
-ccl_device_inline float3 max(const float3& a, const float3& b)
+ccl_device_inline float nonzerof(float f, float eps)
{
-#ifdef __KERNEL_SSE__
- return _mm_max_ps(a.m128, b.m128);
-#else
- return make_float3(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z));
-#endif
+ if(fabsf(f) < eps)
+ return signf(f)*eps;
+ else
+ return f;
}
-ccl_device_inline float3 clamp(const float3& a, const float3& mn, const float3& mx)
+ccl_device_inline float smoothstepf(float f)
{
- return min(max(a, mn), mx);
+ float ff = f*f;
+ return (3.0f*ff - 2.0f*ff*f);
}
-ccl_device_inline float3 fabs(const float3& a)
+ccl_device_inline int mod(int x, int m)
{
-#ifdef __KERNEL_SSE__
- __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff));
- return _mm_and_ps(a.m128, mask);
-#else
- return make_float3(fabsf(a.x), fabsf(a.y), fabsf(a.z));
-#endif
+ return (x % m + m) % m;
}
-#endif
-
ccl_device_inline float3 float2_to_float3(const float2 a)
{
return make_float3(a.x, a.y, 0.0f);
@@ -704,546 +309,19 @@ ccl_device_inline float4 float3_to_float4(const float3 a)
return make_float4(a.x, a.y, a.z, 1.0f);
}
-#ifndef __KERNEL_GPU__
-
-ccl_device_inline void print_float3(const char *label, const float3& a)
-{
- printf("%s: %.8f %.8f %.8f\n", label, (double)a.x, (double)a.y, (double)a.z);
-}
-
-ccl_device_inline float3 rcp(const float3& a)
-{
-#ifdef __KERNEL_SSE__
- float4 r = _mm_rcp_ps(a.m128);
- return _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(_mm_mul_ps(r, r), a));
-#else
- return make_float3(1.0f/a.x, 1.0f/a.y, 1.0f/a.z);
-#endif
-}
-
-#endif
-
-ccl_device_inline float3 interp(float3 a, float3 b, float t)
-{
- return a + t*(b - a);
-}
-
-#ifndef __KERNEL_OPENCL__
-
-ccl_device_inline float3 mix(const float3& a, const float3& b, float t)
-{
- return a + t*(b - a);
-}
-
-#endif
-
-ccl_device_inline bool is_zero(const float3 a)
-{
-#ifdef __KERNEL_SSE__
- return a == make_float3(0.0f);
-#else
- return (a.x == 0.0f && a.y == 0.0f && a.z == 0.0f);
-#endif
-}
-
-ccl_device_inline float reduce_add(const float3 a)
-{
- return (a.x + a.y + a.z);
-}
-
-ccl_device_inline float average(const float3 a)
-{
- return reduce_add(a)*(1.0f/3.0f);
-}
-
-ccl_device_inline bool isequal_float3(const float3 a, const float3 b)
-{
-#ifdef __KERNEL_OPENCL__
- return all(a == b);
-#else
- return a == b;
-#endif
-}
-
-/* Float4 Vector */
-
-#ifdef __KERNEL_SSE__
-
-template<size_t index_0, size_t index_1, size_t index_2, size_t index_3> __forceinline const float4 shuffle(const float4& b)
-{
- return _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(b), _MM_SHUFFLE(index_3, index_2, index_1, index_0)));
-}
-
-#if defined(__KERNEL_SSE3__)
-template<> __forceinline const float4 shuffle<0, 0, 2, 2>(const float4& b)
-{
- return _mm_moveldup_ps(b);
-}
-
-template<> __forceinline const float4 shuffle<1, 1, 3, 3>(const float4& b)
-{
- return _mm_movehdup_ps(b);
-}
-#endif
-
-template<> __forceinline const float4 shuffle<0, 1, 0, 1>(const float4& b)
-{
- return _mm_castpd_ps(_mm_movedup_pd(_mm_castps_pd(b)));
-}
-
-#endif
-
-#ifndef __KERNEL_OPENCL__
-
-ccl_device_inline float4 operator-(const float4& a)
-{
-#ifdef __KERNEL_SSE__
- __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000));
- return _mm_xor_ps(a.m128, mask);
-#else
- return make_float4(-a.x, -a.y, -a.z, -a.w);
-#endif
-}
-
-ccl_device_inline float4 operator*(const float4& a, const float4& b)
-{
-#ifdef __KERNEL_SSE__
- return _mm_mul_ps(a.m128, b.m128);
-#else
- return make_float4(a.x*b.x, a.y*b.y, a.z*b.z, a.w*b.w);
-#endif
-}
-
-ccl_device_inline float4 operator*(const float4& a, float f)
-{
-#if defined(__KERNEL_SSE__)
- return a * make_float4(f);
-#else
- return make_float4(a.x*f, a.y*f, a.z*f, a.w*f);
-#endif
-}
-
-ccl_device_inline float4 operator*(float f, const float4& a)
-{
- return a * f;
-}
-
-ccl_device_inline float4 rcp(const float4& a)
-{
-#ifdef __KERNEL_SSE__
- float4 r = _mm_rcp_ps(a.m128);
- return _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(_mm_mul_ps(r, r), a));
-#else
- return make_float4(1.0f/a.x, 1.0f/a.y, 1.0f/a.z, 1.0f/a.w);
-#endif
-}
-
-ccl_device_inline float4 operator/(const float4& a, float f)
-{
- return a * (1.0f/f);
-}
-
-ccl_device_inline float4 operator/(const float4& a, const float4& b)
-{
-#ifdef __KERNEL_SSE__
- return a * rcp(b);
-#else
- return make_float4(a.x/b.x, a.y/b.y, a.z/b.z, a.w/b.w);
-#endif
-
-}
-
-ccl_device_inline float4 operator+(const float4& a, const float4& b)
-{
-#ifdef __KERNEL_SSE__
- return _mm_add_ps(a.m128, b.m128);
-#else
- return make_float4(a.x+b.x, a.y+b.y, a.z+b.z, a.w+b.w);
-#endif
-}
-
-ccl_device_inline float4 operator-(const float4& a, const float4& b)
-{
-#ifdef __KERNEL_SSE__
- return _mm_sub_ps(a.m128, b.m128);
-#else
- return make_float4(a.x-b.x, a.y-b.y, a.z-b.z, a.w-b.w);
-#endif
-}
-
-ccl_device_inline float4 operator+=(float4& a, const float4& b)
-{
- return a = a + b;
-}
-
-ccl_device_inline float4 operator*=(float4& a, const float4& b)
-{
- return a = a * b;
-}
-
-ccl_device_inline float4 operator/=(float4& a, float f)
-{
- return a = a / f;
-}
-
-ccl_device_inline int4 operator<(const float4& a, const float4& b)
-{
-#ifdef __KERNEL_SSE__
- return _mm_cvtps_epi32(_mm_cmplt_ps(a.m128, b.m128)); /* todo: avoid cvt */
-#else
- return make_int4(a.x < b.x, a.y < b.y, a.z < b.z, a.w < b.w);
-#endif
-}
-
-ccl_device_inline int4 operator>=(const float4& a, const float4& b)
-{
-#ifdef __KERNEL_SSE__
- return _mm_cvtps_epi32(_mm_cmpge_ps(a.m128, b.m128)); /* todo: avoid cvt */
-#else
- return make_int4(a.x >= b.x, a.y >= b.y, a.z >= b.z, a.w >= b.w);
-#endif
-}
-
-ccl_device_inline int4 operator<=(const float4& a, const float4& b)
-{
-#ifdef __KERNEL_SSE__
- return _mm_cvtps_epi32(_mm_cmple_ps(a.m128, b.m128)); /* todo: avoid cvt */
-#else
- return make_int4(a.x <= b.x, a.y <= b.y, a.z <= b.z, a.w <= b.w);
-#endif
-}
-
-ccl_device_inline bool operator==(const float4& a, const float4& b)
-{
-#ifdef __KERNEL_SSE__
- return (_mm_movemask_ps(_mm_cmpeq_ps(a.m128, b.m128)) & 15) == 15;
-#else
- return (a.x == b.x && a.y == b.y && a.z == b.z && a.w == b.w);
-#endif
-}
-
-ccl_device_inline float4 cross(const float4& a, const float4& b)
-{
-#ifdef __KERNEL_SSE__
- return (shuffle<1,2,0,0>(a)*shuffle<2,0,1,0>(b)) - (shuffle<2,0,1,0>(a)*shuffle<1,2,0,0>(b));
-#else
- return make_float4(a.y*b.z - a.z*b.y, a.z*b.x - a.x*b.z, a.x*b.y - a.y*b.x, 0.0f);
-#endif
-}
-
-ccl_device_inline bool is_zero(const float4& a)
-{
-#ifdef __KERNEL_SSE__
- return a == make_float4(0.0f);
-#else
- return (a.x == 0.0f && a.y == 0.0f && a.z == 0.0f && a.w == 0.0f);
-#endif
-}
-
-ccl_device_inline float reduce_add(const float4& a)
-{
-#ifdef __KERNEL_SSE__
- float4 h = shuffle<1,0,3,2>(a) + a;
- return _mm_cvtss_f32(shuffle<2,3,0,1>(h) + h); /* todo: efficiency? */
-#else
- return ((a.x + a.y) + (a.z + a.w));
-#endif
-}
-
-ccl_device_inline float average(const float4& a)
-{
- return reduce_add(a) * 0.25f;
-}
-
-ccl_device_inline float len(const float4& a)
-{
- return sqrtf(dot(a, a));
-}
-
-ccl_device_inline float4 normalize(const float4& a)
-{
- return a/len(a);
-}
-
-ccl_device_inline float4 safe_normalize(const float4& a)
-{
- float t = len(a);
- return (t != 0.0f)? a/t: a;
-}
-
-ccl_device_inline float4 min(const float4& a, const float4& b)
-{
-#ifdef __KERNEL_SSE__
- return _mm_min_ps(a.m128, b.m128);
-#else
- return make_float4(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z), min(a.w, b.w));
-#endif
-}
-
-ccl_device_inline float4 max(const float4& a, const float4& b)
-{
-#ifdef __KERNEL_SSE__
- return _mm_max_ps(a.m128, b.m128);
-#else
- return make_float4(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z), max(a.w, b.w));
-#endif
-}
-
-#endif
-
-#ifndef __KERNEL_GPU__
-
-ccl_device_inline float4 select(const int4& mask, const float4& a, const float4& b)
-{
-#ifdef __KERNEL_SSE__
- return _mm_or_ps(_mm_and_ps(_mm_cvtepi32_ps(mask), a), _mm_andnot_ps(_mm_cvtepi32_ps(mask), b)); /* todo: avoid cvt */
-#else
- return make_float4((mask.x)? a.x: b.x, (mask.y)? a.y: b.y, (mask.z)? a.z: b.z, (mask.w)? a.w: b.w);
-#endif
-}
-
-ccl_device_inline float4 reduce_min(const float4& a)
-{
-#ifdef __KERNEL_SSE__
- float4 h = min(shuffle<1,0,3,2>(a), a);
- return min(shuffle<2,3,0,1>(h), h);
-#else
- return make_float4(min(min(a.x, a.y), min(a.z, a.w)));
-#endif
-}
-
-ccl_device_inline float4 reduce_max(const float4& a)
-{
-#ifdef __KERNEL_SSE__
- float4 h = max(shuffle<1,0,3,2>(a), a);
- return max(shuffle<2,3,0,1>(h), h);
-#else
- return make_float4(max(max(a.x, a.y), max(a.z, a.w)));
-#endif
-}
-
-#if 0
-ccl_device_inline float4 reduce_add(const float4& a)
-{
-#ifdef __KERNEL_SSE__
- float4 h = shuffle<1,0,3,2>(a) + a;
- return shuffle<2,3,0,1>(h) + h;
-#else
- return make_float4((a.x + a.y) + (a.z + a.w));
-#endif
-}
-#endif
-
-ccl_device_inline void print_float4(const char *label, const float4& a)
-{
- printf("%s: %.8f %.8f %.8f %.8f\n", label, (double)a.x, (double)a.y, (double)a.z, (double)a.w);
-}
-
-#endif
-
-/* Int2 */
-
-#ifndef __KERNEL_OPENCL__
-
-ccl_device_inline int2 operator+(const int2 &a, const int2 &b)
-{
- return make_int2(a.x + b.x, a.y + b.y);
-}
-
-ccl_device_inline int2 operator+=(int2 &a, const int2 &b)
-{
- return a = a + b;
-}
-
-ccl_device_inline int2 operator-(const int2 &a, const int2 &b)
-{
- return make_int2(a.x - b.x, a.y - b.y);
-}
-
-ccl_device_inline int2 operator*(const int2 &a, const int2 &b)
-{
- return make_int2(a.x * b.x, a.y * b.y);
-}
-
-ccl_device_inline int2 operator/(const int2 &a, const int2 &b)
-{
- return make_int2(a.x / b.x, a.y / b.y);
-}
-
-#endif
-
-/* Int3 */
-
-#ifndef __KERNEL_OPENCL__
-
-ccl_device_inline int3 min(int3 a, int3 b)
-{
-#if defined(__KERNEL_SSE__) && defined(__KERNEL_SSE41__)
- return _mm_min_epi32(a.m128, b.m128);
-#else
- return make_int3(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z));
-#endif
-}
-
-ccl_device_inline int3 max(int3 a, int3 b)
-{
-#if defined(__KERNEL_SSE__) && defined(__KERNEL_SSE41__)
- return _mm_max_epi32(a.m128, b.m128);
-#else
- return make_int3(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z));
-#endif
-}
-
-ccl_device_inline int3 clamp(const int3& a, int mn, int mx)
-{
-#ifdef __KERNEL_SSE__
- return min(max(a, make_int3(mn)), make_int3(mx));
-#else
- return make_int3(clamp(a.x, mn, mx), clamp(a.y, mn, mx), clamp(a.z, mn, mx));
-#endif
-}
-
-ccl_device_inline int3 clamp(const int3& a, int3& mn, int mx)
-{
-#ifdef __KERNEL_SSE__
- return min(max(a, mn), make_int3(mx));
-#else
- return make_int3(clamp(a.x, mn.x, mx), clamp(a.y, mn.y, mx), clamp(a.z, mn.z, mx));
-#endif
-}
-
-#endif
-
-#ifndef __KERNEL_GPU__
-
-ccl_device_inline void print_int3(const char *label, const int3& a)
-{
- printf("%s: %d %d %d\n", label, a.x, a.y, a.z);
-}
-
-#endif
-
-/* Int4 */
-
-#ifndef __KERNEL_GPU__
-
-ccl_device_inline int4 operator+(const int4& a, const int4& b)
-{
-#ifdef __KERNEL_SSE__
- return _mm_add_epi32(a.m128, b.m128);
-#else
- return make_int4(a.x+b.x, a.y+b.y, a.z+b.z, a.w+b.w);
-#endif
-}
-
-ccl_device_inline int4 operator+=(int4& a, const int4& b)
-{
- return a = a + b;
-}
-
-ccl_device_inline int4 operator>>(const int4& a, int i)
-{
-#ifdef __KERNEL_SSE__
- return _mm_srai_epi32(a.m128, i);
-#else
- return make_int4(a.x >> i, a.y >> i, a.z >> i, a.w >> i);
-#endif
-}
-
-ccl_device_inline int4 min(int4 a, int4 b)
-{
-#if defined(__KERNEL_SSE__) && defined(__KERNEL_SSE41__)
- return _mm_min_epi32(a.m128, b.m128);
-#else
- return make_int4(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z), min(a.w, b.w));
-#endif
-}
-
-ccl_device_inline int4 max(int4 a, int4 b)
-{
-#if defined(__KERNEL_SSE__) && defined(__KERNEL_SSE41__)
- return _mm_max_epi32(a.m128, b.m128);
-#else
- return make_int4(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z), max(a.w, b.w));
-#endif
-}
-
-ccl_device_inline int4 clamp(const int4& a, const int4& mn, const int4& mx)
-{
- return min(max(a, mn), mx);
-}
-
-ccl_device_inline int4 select(const int4& mask, const int4& a, const int4& b)
-{
-#ifdef __KERNEL_SSE__
- __m128 m = _mm_cvtepi32_ps(mask);
- return _mm_castps_si128(_mm_or_ps(_mm_and_ps(m, _mm_castsi128_ps(a)), _mm_andnot_ps(m, _mm_castsi128_ps(b)))); /* todo: avoid cvt */
-#else
- return make_int4((mask.x)? a.x: b.x, (mask.y)? a.y: b.y, (mask.z)? a.z: b.z, (mask.w)? a.w: b.w);
-#endif
-}
+CCL_NAMESPACE_END
-ccl_device_inline void print_int4(const char *label, const int4& a)
-{
- printf("%s: %d %d %d %d\n", label, a.x, a.y, a.z, a.w);
-}
+#include "util/util_math_int2.h"
+#include "util/util_math_int3.h"
+#include "util/util_math_int4.h"
-#endif
+#include "util/util_math_float2.h"
+#include "util/util_math_float3.h"
+#include "util/util_math_float4.h"
-/* Int/Float conversion */
+CCL_NAMESPACE_BEGIN
#ifndef __KERNEL_OPENCL__
-
-ccl_device_inline int as_int(uint i)
-{
- union { uint ui; int i; } u;
- u.ui = i;
- return u.i;
-}
-
-ccl_device_inline uint as_uint(int i)
-{
- union { uint ui; int i; } u;
- u.i = i;
- return u.ui;
-}
-
-ccl_device_inline uint as_uint(float f)
-{
- union { uint i; float f; } u;
- u.f = f;
- return u.i;
-}
-
-ccl_device_inline int __float_as_int(float f)
-{
- union { int i; float f; } u;
- u.f = f;
- return u.i;
-}
-
-ccl_device_inline float __int_as_float(int i)
-{
- union { int i; float f; } u;
- u.i = i;
- return u.f;
-}
-
-ccl_device_inline uint __float_as_uint(float f)
-{
- union { uint i; float f; } u;
- u.f = f;
- return u.i;
-}
-
-ccl_device_inline float __uint_as_float(uint i)
-{
- union { uint i; float f; } u;
- u.i = i;
- return u.f;
-}
-
-
/* Interpolation */
template<class A, class B> A lerp(const A& a, const A& b, const B& t)
@@ -1253,26 +331,13 @@ template<class A, class B> A lerp(const A& a, const A& b, const B& t)
/* Triangle */
-ccl_device_inline float triangle_area(const float3& v1, const float3& v2, const float3& v3)
+ccl_device_inline float triangle_area(const float3& v1,
+ const float3& v2,
+ const float3& v3)
{
return len(cross(v3 - v2, v1 - v2))*0.5f;
}
-
-#endif
-
-/* Versions of functions which are safe for fast math. */
-ccl_device_inline bool isnan_safe(float f)
-{
- unsigned int x = __float_as_uint(f);
- return (x << 1) > 0xff000000u;
-}
-
-ccl_device_inline bool isfinite_safe(float f)
-{
- /* By IEEE 754 rule, 2*Inf equals Inf */
- unsigned int x = __float_as_uint(f);
- return (f == f) && (x == 0 || (f != 2.0f*f)) && !((x << 1) > 0xff000000u);
-}
+#endif /* __KERNEL_OPENCL__ */
/* Orthonormal vectors */
@@ -1369,16 +434,16 @@ ccl_device_inline float3 rotate_around_axis(float3 p, float3 axis, float angle)
float3 r;
r.x = ((costheta + (1 - costheta) * axis.x * axis.x) * p.x) +
- (((1 - costheta) * axis.x * axis.y - axis.z * sintheta) * p.y) +
- (((1 - costheta) * axis.x * axis.z + axis.y * sintheta) * p.z);
+ (((1 - costheta) * axis.x * axis.y - axis.z * sintheta) * p.y) +
+ (((1 - costheta) * axis.x * axis.z + axis.y * sintheta) * p.z);
r.y = (((1 - costheta) * axis.x * axis.y + axis.z * sintheta) * p.x) +
- ((costheta + (1 - costheta) * axis.y * axis.y) * p.y) +
- (((1 - costheta) * axis.y * axis.z - axis.x * sintheta) * p.z);
+ ((costheta + (1 - costheta) * axis.y * axis.y) * p.y) +
+ (((1 - costheta) * axis.y * axis.z - axis.x * sintheta) * p.z);
r.z = (((1 - costheta) * axis.x * axis.z - axis.y * sintheta) * p.x) +
- (((1 - costheta) * axis.y * axis.z + axis.x * sintheta) * p.y) +
- ((costheta + (1 - costheta) * axis.z * axis.z) * p.z);
+ (((1 - costheta) * axis.y * axis.z + axis.x * sintheta) * p.y) +
+ ((costheta + (1 - costheta) * axis.z * axis.z) * p.z);
return r;
}
@@ -1427,17 +492,17 @@ ccl_device float safe_powf(float a, float b)
return compatible_powf(a, b);
}
-ccl_device float safe_logf(float a, float b)
+ccl_device float safe_divide(float a, float b)
{
- if(UNLIKELY(a < 0.0f || b < 0.0f))
- return 0.0f;
-
- return logf(a)/logf(b);
+ return (b != 0.0f)? a/b: 0.0f;
}
-ccl_device float safe_divide(float a, float b)
+ccl_device float safe_logf(float a, float b)
{
- return (b != 0.0f)? a/b: 0.0f;
+ if(UNLIKELY(a <= 0.0f || b <= 0.0f))
+ return 0.0f;
+
+ return safe_divide(logf(a),logf(b));
}
ccl_device float safe_modulo(float a, float b)
@@ -1493,31 +558,6 @@ ccl_device_inline float2 map_to_sphere(const float3 co)
return make_float2(u, v);
}
-ccl_device_inline int util_max_axis(float3 vec)
-{
-#ifdef __KERNEL_SSE__
- __m128 a = shuffle<0,0,1,1>(vec.m128);
- __m128 b = shuffle<1,2,2,1>(vec.m128);
- __m128 c = _mm_cmpgt_ps(a, b);
- int mask = _mm_movemask_ps(c) & 0x7;
- static const char tab[8] = {2, 2, 2, 0, 1, 2, 1, 0};
- return tab[mask];
-#else
- if(vec.x > vec.y) {
- if(vec.x > vec.z)
- return 0;
- else
- return 2;
- }
- else {
- if(vec.y > vec.z)
- return 1;
- else
- return 2;
- }
-#endif
-}
-
CCL_NAMESPACE_END
#endif /* __UTIL_MATH_H__ */
diff --git a/intern/cycles/util/util_math_float2.h b/intern/cycles/util/util_math_float2.h
new file mode 100644
index 00000000000..6f9d0855d50
--- /dev/null
+++ b/intern/cycles/util/util_math_float2.h
@@ -0,0 +1,227 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __UTIL_MATH_FLOAT2_H__
+#define __UTIL_MATH_FLOAT2_H__
+
+#ifndef __UTIL_MATH_H__
+# error "Do not include this file directly, include util_types.h instead."
+#endif
+
+CCL_NAMESPACE_BEGIN
+
+/*******************************************************************************
+ * Declaration.
+ */
+
+#ifndef __KERNEL_OPENCL__
+ccl_device_inline float2 operator-(const float2& a);
+ccl_device_inline float2 operator*(const float2& a, const float2& b);
+ccl_device_inline float2 operator*(const float2& a, float f);
+ccl_device_inline float2 operator*(float f, const float2& a);
+ccl_device_inline float2 operator/(float f, const float2& a);
+ccl_device_inline float2 operator/(const float2& a, float f);
+ccl_device_inline float2 operator/(const float2& a, const float2& b);
+ccl_device_inline float2 operator+(const float2& a, const float2& b);
+ccl_device_inline float2 operator-(const float2& a, const float2& b);
+ccl_device_inline float2 operator+=(float2& a, const float2& b);
+ccl_device_inline float2 operator*=(float2& a, const float2& b);
+ccl_device_inline float2 operator*=(float2& a, float f);
+ccl_device_inline float2 operator/=(float2& a, const float2& b);
+ccl_device_inline float2 operator/=(float2& a, float f);
+
+ccl_device_inline bool operator==(const float2& a, const float2& b);
+ccl_device_inline bool operator!=(const float2& a, const float2& b);
+
+ccl_device_inline bool is_zero(const float2& a);
+ccl_device_inline float average(const float2& a);
+ccl_device_inline float dot(const float2& a, const float2& b);
+ccl_device_inline float cross(const float2& a, const float2& b);
+ccl_device_inline float len(const float2& a);
+ccl_device_inline float2 normalize(const float2& a);
+ccl_device_inline float2 normalize_len(const float2& a, float *t);
+ccl_device_inline float2 safe_normalize(const float2& a);
+ccl_device_inline float2 min(const float2& a, const float2& b);
+ccl_device_inline float2 max(const float2& a, const float2& b);
+ccl_device_inline float2 clamp(const float2& a, const float2& mn, const float2& mx);
+ccl_device_inline float2 fabs(const float2& a);
+ccl_device_inline float2 as_float2(const float4& a);
+ccl_device_inline float2 interp(const float2& a, const float2& b, float t);
+#endif /* !__KERNEL_OPENCL__ */
+
+/*******************************************************************************
+ * Definition.
+ */
+
+#ifndef __KERNEL_OPENCL__
+ccl_device_inline float2 operator-(const float2& a)
+{
+ return make_float2(-a.x, -a.y);
+}
+
+ccl_device_inline float2 operator*(const float2& a, const float2& b)
+{
+ return make_float2(a.x*b.x, a.y*b.y);
+}
+
+ccl_device_inline float2 operator*(const float2& a, float f)
+{
+ return make_float2(a.x*f, a.y*f);
+}
+
+ccl_device_inline float2 operator*(float f, const float2& a)
+{
+ return make_float2(a.x*f, a.y*f);
+}
+
+ccl_device_inline float2 operator/(float f, const float2& a)
+{
+ return make_float2(f/a.x, f/a.y);
+}
+
+ccl_device_inline float2 operator/(const float2& a, float f)
+{
+ float invf = 1.0f/f;
+ return make_float2(a.x*invf, a.y*invf);
+}
+
+ccl_device_inline float2 operator/(const float2& a, const float2& b)
+{
+ return make_float2(a.x/b.x, a.y/b.y);
+}
+
+ccl_device_inline float2 operator+(const float2& a, const float2& b)
+{
+ return make_float2(a.x+b.x, a.y+b.y);
+}
+
+ccl_device_inline float2 operator-(const float2& a, const float2& b)
+{
+ return make_float2(a.x-b.x, a.y-b.y);
+}
+
+ccl_device_inline float2 operator+=(float2& a, const float2& b)
+{
+ return a = a + b;
+}
+
+ccl_device_inline float2 operator*=(float2& a, const float2& b)
+{
+ return a = a * b;
+}
+
+ccl_device_inline float2 operator*=(float2& a, float f)
+{
+ return a = a * f;
+}
+
+ccl_device_inline float2 operator/=(float2& a, const float2& b)
+{
+ return a = a / b;
+}
+
+ccl_device_inline float2 operator/=(float2& a, float f)
+{
+ float invf = 1.0f/f;
+ return a = a * invf;
+}
+
+ccl_device_inline bool operator==(const float2& a, const float2& b)
+{
+ return (a.x == b.x && a.y == b.y);
+}
+
+ccl_device_inline bool operator!=(const float2& a, const float2& b)
+{
+ return !(a == b);
+}
+
+ccl_device_inline bool is_zero(const float2& a)
+{
+ return (a.x == 0.0f && a.y == 0.0f);
+}
+
+ccl_device_inline float average(const float2& a)
+{
+ return (a.x + a.y)*(1.0f/2.0f);
+}
+
+ccl_device_inline float dot(const float2& a, const float2& b)
+{
+ return a.x*b.x + a.y*b.y;
+}
+
+ccl_device_inline float cross(const float2& a, const float2& b)
+{
+ return (a.x*b.y - a.y*b.x);
+}
+
+ccl_device_inline float len(const float2& a)
+{
+ return sqrtf(dot(a, a));
+}
+
+ccl_device_inline float2 normalize(const float2& a)
+{
+ return a/len(a);
+}
+
+ccl_device_inline float2 normalize_len(const float2& a, float *t)
+{
+ *t = len(a);
+ return a/(*t);
+}
+
+ccl_device_inline float2 safe_normalize(const float2& a)
+{
+ float t = len(a);
+ return (t != 0.0f)? a/t: a;
+}
+
+ccl_device_inline float2 min(const float2& a, const float2& b)
+{
+ return make_float2(min(a.x, b.x), min(a.y, b.y));
+}
+
+ccl_device_inline float2 max(const float2& a, const float2& b)
+{
+ return make_float2(max(a.x, b.x), max(a.y, b.y));
+}
+
+ccl_device_inline float2 clamp(const float2& a, const float2& mn, const float2& mx)
+{
+ return min(max(a, mn), mx);
+}
+
+ccl_device_inline float2 fabs(const float2& a)
+{
+ return make_float2(fabsf(a.x), fabsf(a.y));
+}
+
+ccl_device_inline float2 as_float2(const float4& a)
+{
+ return make_float2(a.x, a.y);
+}
+
+ccl_device_inline float2 interp(const float2& a, const float2& b, float t)
+{
+ return a + t*(b - a);
+}
+#endif /* !__KERNEL_OPENCL__ */
+
+CCL_NAMESPACE_END
+
+#endif /* __UTIL_MATH_FLOAT2_H__ */
diff --git a/intern/cycles/util/util_math_float3.h b/intern/cycles/util/util_math_float3.h
new file mode 100644
index 00000000000..bb04c4aa2d9
--- /dev/null
+++ b/intern/cycles/util/util_math_float3.h
@@ -0,0 +1,385 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __UTIL_MATH_FLOAT3_H__
+#define __UTIL_MATH_FLOAT3_H__
+
+#ifndef __UTIL_MATH_H__
+# error "Do not include this file directly, include util_types.h instead."
+#endif
+
+CCL_NAMESPACE_BEGIN
+
+/*******************************************************************************
+ * Declaration.
+ */
+
+#ifndef __KERNEL_OPENCL__
+ccl_device_inline float3 operator-(const float3& a);
+ccl_device_inline float3 operator*(const float3& a, const float3& b);
+ccl_device_inline float3 operator*(const float3& a, const float f);
+ccl_device_inline float3 operator*(const float f, const float3& a);
+ccl_device_inline float3 operator/(const float f, const float3& a);
+ccl_device_inline float3 operator/(const float3& a, const float f);
+ccl_device_inline float3 operator/(const float3& a, const float3& b);
+ccl_device_inline float3 operator+(const float3& a, const float3& b);
+ccl_device_inline float3 operator-(const float3& a, const float3& b);
+ccl_device_inline float3 operator+=(float3& a, const float3& b);
+ccl_device_inline float3 operator-=(float3& a, const float3& b);
+ccl_device_inline float3 operator*=(float3& a, const float3& b);
+ccl_device_inline float3 operator*=(float3& a, float f);
+ccl_device_inline float3 operator/=(float3& a, const float3& b);
+ccl_device_inline float3 operator/=(float3& a, float f);
+
+ccl_device_inline bool operator==(const float3& a, const float3& b);
+ccl_device_inline bool operator!=(const float3& a, const float3& b);
+
+ccl_device_inline float dot(const float3& a, const float3& b);
+ccl_device_inline float dot_xy(const float3& a, const float3& b);
+ccl_device_inline float3 cross(const float3& a, const float3& b);
+ccl_device_inline float3 normalize(const float3& a);
+ccl_device_inline float3 min(const float3& a, const float3& b);
+ccl_device_inline float3 max(const float3& a, const float3& b);
+ccl_device_inline float3 clamp(const float3& a, const float3& mn, const float3& mx);
+ccl_device_inline float3 fabs(const float3& a);
+ccl_device_inline float3 mix(const float3& a, const float3& b, float t);
+ccl_device_inline float3 rcp(const float3& a);
+#endif /* !__KERNEL_OPENCL__ */
+
+ccl_device_inline float max3(float3 a);
+ccl_device_inline float len(const float3 a);
+ccl_device_inline float len_squared(const float3 a);
+
+ccl_device_inline float3 saturate3(float3 a);
+ccl_device_inline float3 safe_normalize(const float3 a);
+ccl_device_inline float3 normalize_len(const float3 a, float *t);
+ccl_device_inline float3 safe_normalize_len(const float3 a, float *t);
+ccl_device_inline float3 interp(float3 a, float3 b, float t);
+
+ccl_device_inline bool is_zero(const float3 a);
+ccl_device_inline float reduce_add(const float3 a);
+ccl_device_inline float average(const float3 a);
+ccl_device_inline bool isequal_float3(const float3 a, const float3 b);
+
+/*******************************************************************************
+ * Definition.
+ */
+
+#ifndef __KERNEL_OPENCL__
+ccl_device_inline float3 operator-(const float3& a)
+{
+#ifdef __KERNEL_SSE__
+ return float3(_mm_xor_ps(a.m128, _mm_castsi128_ps(_mm_set1_epi32(0x80000000))));
+#else
+ return make_float3(-a.x, -a.y, -a.z);
+#endif
+}
+
+ccl_device_inline float3 operator*(const float3& a, const float3& b)
+{
+#ifdef __KERNEL_SSE__
+ return float3(_mm_mul_ps(a.m128,b.m128));
+#else
+ return make_float3(a.x*b.x, a.y*b.y, a.z*b.z);
+#endif
+}
+
+ccl_device_inline float3 operator*(const float3& a, const float f)
+{
+#ifdef __KERNEL_SSE__
+ return float3(_mm_mul_ps(a.m128,_mm_set1_ps(f)));
+#else
+ return make_float3(a.x*f, a.y*f, a.z*f);
+#endif
+}
+
+ccl_device_inline float3 operator*(const float f, const float3& a)
+{
+ /* TODO(sergey): Currently disabled, gives speedup but causes precision issues. */
+#if defined(__KERNEL_SSE__) && 0
+ return float3(_mm_mul_ps(_mm_set1_ps(f), a.m128));
+#else
+ return make_float3(a.x*f, a.y*f, a.z*f);
+#endif
+}
+
+ccl_device_inline float3 operator/(const float f, const float3& a)
+{
+ /* TODO(sergey): Currently disabled, gives speedup but causes precision issues. */
+#if defined(__KERNEL_SSE__) && 0
+ __m128 rc = _mm_rcp_ps(a.m128);
+ return float3(_mm_mul_ps(_mm_set1_ps(f),rc));
+#else
+ return make_float3(f / a.x, f / a.y, f / a.z);
+#endif
+}
+
+ccl_device_inline float3 operator/(const float3& a, const float f)
+{
+ float invf = 1.0f/f;
+ return a * invf;
+}
+
+ccl_device_inline float3 operator/(const float3& a, const float3& b)
+{
+ /* TODO(sergey): Currently disabled, gives speedup but causes precision issues. */
+#if defined(__KERNEL_SSE__) && 0
+ __m128 rc = _mm_rcp_ps(b.m128);
+ return float3(_mm_mul_ps(a, rc));
+#else
+ return make_float3(a.x / b.x, a.y / b.y, a.z / b.z);
+#endif
+}
+
+ccl_device_inline float3 operator+(const float3& a, const float3& b)
+{
+#ifdef __KERNEL_SSE__
+ return float3(_mm_add_ps(a.m128, b.m128));
+#else
+ return make_float3(a.x + b.x, a.y + b.y, a.z + b.z);
+#endif
+}
+
+ccl_device_inline float3 operator-(const float3& a, const float3& b)
+{
+#ifdef __KERNEL_SSE__
+ return float3(_mm_sub_ps(a.m128, b.m128));
+#else
+ return make_float3(a.x - b.x, a.y - b.y, a.z - b.z);
+#endif
+}
+
+ccl_device_inline float3 operator+=(float3& a, const float3& b)
+{
+ return a = a + b;
+}
+
+ccl_device_inline float3 operator-=(float3& a, const float3& b)
+{
+ return a = a - b;
+}
+
+ccl_device_inline float3 operator*=(float3& a, const float3& b)
+{
+ return a = a * b;
+}
+
+ccl_device_inline float3 operator*=(float3& a, float f)
+{
+ return a = a * f;
+}
+
+ccl_device_inline float3 operator/=(float3& a, const float3& b)
+{
+ return a = a / b;
+}
+
+ccl_device_inline float3 operator/=(float3& a, float f)
+{
+ float invf = 1.0f/f;
+ return a = a * invf;
+}
+
+ccl_device_inline bool operator==(const float3& a, const float3& b)
+{
+#ifdef __KERNEL_SSE__
+ return (_mm_movemask_ps(_mm_cmpeq_ps(a.m128, b.m128)) & 7) == 7;
+#else
+ return (a.x == b.x && a.y == b.y && a.z == b.z);
+#endif
+}
+
+ccl_device_inline bool operator!=(const float3& a, const float3& b)
+{
+ return !(a == b);
+}
+
+ccl_device_inline float dot(const float3& a, const float3& b)
+{
+#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
+ return _mm_cvtss_f32(_mm_dp_ps(a, b, 0x7F));
+#else
+ return a.x*b.x + a.y*b.y + a.z*b.z;
+#endif
+}
+
+ccl_device_inline float dot_xy(const float3& a, const float3& b)
+{
+#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
+ return _mm_cvtss_f32(_mm_hadd_ps(_mm_mul_ps(a,b),b));
+#else
+ return a.x*b.x + a.y*b.y;
+#endif
+}
+
+ccl_device_inline float3 cross(const float3& a, const float3& b)
+{
+ float3 r = make_float3(a.y*b.z - a.z*b.y, a.z*b.x - a.x*b.z, a.x*b.y - a.y*b.x);
+ return r;
+}
+
+ccl_device_inline float3 normalize(const float3& a)
+{
+#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
+ __m128 norm = _mm_sqrt_ps(_mm_dp_ps(a.m128, a.m128, 0x7F));
+ return float3(_mm_div_ps(a.m128, norm));
+#else
+ return a/len(a);
+#endif
+}
+
+ccl_device_inline float3 min(const float3& a, const float3& b)
+{
+#ifdef __KERNEL_SSE__
+ return float3(_mm_min_ps(a.m128, b.m128));
+#else
+ return make_float3(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z));
+#endif
+}
+
+ccl_device_inline float3 max(const float3& a, const float3& b)
+{
+#ifdef __KERNEL_SSE__
+ return float3(_mm_max_ps(a.m128, b.m128));
+#else
+ return make_float3(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z));
+#endif
+}
+
+ccl_device_inline float3 clamp(const float3& a, const float3& mn, const float3& mx)
+{
+ return min(max(a, mn), mx);
+}
+
+ccl_device_inline float3 fabs(const float3& a)
+{
+#ifdef __KERNEL_SSE__
+ __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff));
+ return float3(_mm_and_ps(a.m128, mask));
+#else
+ return make_float3(fabsf(a.x), fabsf(a.y), fabsf(a.z));
+#endif
+}
+
+ccl_device_inline float3 mix(const float3& a, const float3& b, float t)
+{
+ return a + t*(b - a);
+}
+
+ccl_device_inline float3 rcp(const float3& a)
+{
+#ifdef __KERNEL_SSE__
+ const float4 r(_mm_rcp_ps(a.m128));
+ return float3(_mm_sub_ps(_mm_add_ps(r, r),
+ _mm_mul_ps(_mm_mul_ps(r, r), a)));
+#else
+ return make_float3(1.0f/a.x, 1.0f/a.y, 1.0f/a.z);
+#endif
+}
+#endif /* !__KERNEL_OPENCL__ */
+
+ccl_device_inline float max3(float3 a)
+{
+ return max(max(a.x, a.y), a.z);
+}
+
+ccl_device_inline float len(const float3 a)
+{
+#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
+ return _mm_cvtss_f32(_mm_sqrt_ss(_mm_dp_ps(a.m128, a.m128, 0x7F)));
+#else
+ return sqrtf(dot(a, a));
+#endif
+}
+
+ccl_device_inline float len_squared(const float3 a)
+{
+ return dot(a, a);
+}
+
+ccl_device_inline float3 saturate3(float3 a)
+{
+ return make_float3(saturate(a.x), saturate(a.y), saturate(a.z));
+}
+
+ccl_device_inline float3 normalize_len(const float3 a, float *t)
+{
+ *t = len(a);
+ float x = 1.0f / *t;
+ return a*x;
+}
+
+ccl_device_inline float3 safe_normalize(const float3 a)
+{
+ float t = len(a);
+ return (t != 0.0f)? a * (1.0f/t) : a;
+}
+
+ccl_device_inline float3 safe_normalize_len(const float3 a, float *t)
+{
+ *t = len(a);
+ return (*t != 0.0f)? a/(*t): a;
+}
+
+ccl_device_inline float3 interp(float3 a, float3 b, float t)
+{
+ return a + t*(b - a);
+}
+
+ccl_device_inline bool is_zero(const float3 a)
+{
+#ifdef __KERNEL_SSE__
+ return a == make_float3(0.0f);
+#else
+ return (a.x == 0.0f && a.y == 0.0f && a.z == 0.0f);
+#endif
+}
+
+ccl_device_inline float reduce_add(const float3 a)
+{
+ return (a.x + a.y + a.z);
+}
+
+ccl_device_inline float average(const float3 a)
+{
+ return reduce_add(a)*(1.0f/3.0f);
+}
+
+ccl_device_inline bool isequal_float3(const float3 a, const float3 b)
+{
+#ifdef __KERNEL_OPENCL__
+ return all(a == b);
+#else
+ return a == b;
+#endif
+}
+
+ccl_device_inline bool isfinite3_safe(float3 v)
+{
+ return isfinite_safe(v.x) && isfinite_safe(v.y) && isfinite_safe(v.z);
+}
+
+ccl_device_inline float3 ensure_finite3(float3 v)
+{
+ if(!isfinite_safe(v.x)) v.x = 0.0f;
+ if(!isfinite_safe(v.y)) v.y = 0.0f;
+ if(!isfinite_safe(v.z)) v.z = 0.0f;
+ return v;
+}
+
+CCL_NAMESPACE_END
+
+#endif /* __UTIL_MATH_FLOAT3_H__ */
diff --git a/intern/cycles/util/util_math_float4.h b/intern/cycles/util/util_math_float4.h
new file mode 100644
index 00000000000..d89121b3a1d
--- /dev/null
+++ b/intern/cycles/util/util_math_float4.h
@@ -0,0 +1,393 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __UTIL_MATH_FLOAT4_H__
+#define __UTIL_MATH_FLOAT4_H__
+
+#ifndef __UTIL_MATH_H__
+# error "Do not include this file directly, include util_types.h instead."
+#endif
+
+CCL_NAMESPACE_BEGIN
+
+/*******************************************************************************
+ * Declaration.
+ */
+
+#ifndef __KERNEL_OPENCL__
+ccl_device_inline float4 operator-(const float4& a);
+ccl_device_inline float4 operator*(const float4& a, const float4& b);
+ccl_device_inline float4 operator*(const float4& a, float f);
+ccl_device_inline float4 operator*(float f, const float4& a);
+ccl_device_inline float4 operator/(const float4& a, float f);
+ccl_device_inline float4 operator/(const float4& a, const float4& b);
+ccl_device_inline float4 operator+(const float4& a, const float4& b);
+ccl_device_inline float4 operator-(const float4& a, const float4& b);
+ccl_device_inline float4 operator+=(float4& a, const float4& b);
+ccl_device_inline float4 operator*=(float4& a, const float4& b);
+ccl_device_inline float4 operator/=(float4& a, float f);
+
+ccl_device_inline int4 operator<(const float4& a, const float4& b);
+ccl_device_inline int4 operator>=(const float4& a, const float4& b);
+ccl_device_inline int4 operator<=(const float4& a, const float4& b);
+ccl_device_inline bool operator==(const float4& a, const float4& b);
+
+ccl_device_inline float dot(const float4& a, const float4& b);
+ccl_device_inline float len_squared(const float4& a);
+ccl_device_inline float4 rcp(const float4& a);
+ccl_device_inline float4 cross(const float4& a, const float4& b);
+ccl_device_inline bool is_zero(const float4& a);
+ccl_device_inline float reduce_add(const float4& a);
+ccl_device_inline float average(const float4& a);
+ccl_device_inline float len(const float4& a);
+ccl_device_inline float4 normalize(const float4& a);
+ccl_device_inline float4 safe_normalize(const float4& a);
+ccl_device_inline float4 min(const float4& a, const float4& b);
+ccl_device_inline float4 max(const float4& a, const float4& b);
+#endif /* !__KERNEL_OPENCL__ */
+
+#ifdef __KERNEL_SSE__
+template<size_t index_0, size_t index_1, size_t index_2, size_t index_3>
+__forceinline const float4 shuffle(const float4& b);
+
+template<> __forceinline const float4 shuffle<0, 1, 0, 1>(const float4& b);
+
+# ifdef __KERNEL_SSE3__
+template<> __forceinline const float4 shuffle<0, 0, 2, 2>(const float4& b);
+template<> __forceinline const float4 shuffle<1, 1, 3, 3>(const float4& b);
+# endif
+#endif /* __KERNEL_SSE__ */
+
+#ifndef __KERNEL_GPU__
+ccl_device_inline float4 select(const int4& mask,
+ const float4& a,
+ const float4& b);
+ccl_device_inline float4 reduce_min(const float4& a);
+ccl_device_inline float4 reduce_max(const float4& a);
+# if 0
+ccl_device_inline float4 reduce_add(const float4& a);
+# endif
+#endif /* !__KERNEL_GPU__ */
+
+/*******************************************************************************
+ * Definition.
+ */
+
+#ifndef __KERNEL_OPENCL__
+ccl_device_inline float4 operator-(const float4& a)
+{
+#ifdef __KERNEL_SSE__
+ __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000));
+ return float4(_mm_xor_ps(a.m128, mask));
+#else
+ return make_float4(-a.x, -a.y, -a.z, -a.w);
+#endif
+}
+
+ccl_device_inline float4 operator*(const float4& a, const float4& b)
+{
+#ifdef __KERNEL_SSE__
+ return float4(_mm_mul_ps(a.m128, b.m128));
+#else
+ return make_float4(a.x*b.x, a.y*b.y, a.z*b.z, a.w*b.w);
+#endif
+}
+
+ccl_device_inline float4 operator*(const float4& a, float f)
+{
+#if defined(__KERNEL_SSE__)
+ return a * make_float4(f);
+#else
+ return make_float4(a.x*f, a.y*f, a.z*f, a.w*f);
+#endif
+}
+
+ccl_device_inline float4 operator*(float f, const float4& a)
+{
+ return a * f;
+}
+
+ccl_device_inline float4 operator/(const float4& a, float f)
+{
+ return a * (1.0f/f);
+}
+
+ccl_device_inline float4 operator/(const float4& a, const float4& b)
+{
+#ifdef __KERNEL_SSE__
+ return a * rcp(b);
+#else
+ return make_float4(a.x/b.x, a.y/b.y, a.z/b.z, a.w/b.w);
+#endif
+
+}
+
+ccl_device_inline float4 operator+(const float4& a, const float4& b)
+{
+#ifdef __KERNEL_SSE__
+ return float4(_mm_add_ps(a.m128, b.m128));
+#else
+ return make_float4(a.x+b.x, a.y+b.y, a.z+b.z, a.w+b.w);
+#endif
+}
+
+ccl_device_inline float4 operator-(const float4& a, const float4& b)
+{
+#ifdef __KERNEL_SSE__
+ return float4(_mm_sub_ps(a.m128, b.m128));
+#else
+ return make_float4(a.x-b.x, a.y-b.y, a.z-b.z, a.w-b.w);
+#endif
+}
+
+ccl_device_inline float4 operator+=(float4& a, const float4& b)
+{
+ return a = a + b;
+}
+
+ccl_device_inline float4 operator*=(float4& a, const float4& b)
+{
+ return a = a * b;
+}
+
+ccl_device_inline float4 operator/=(float4& a, float f)
+{
+ return a = a / f;
+}
+
+ccl_device_inline int4 operator<(const float4& a, const float4& b)
+{
+#ifdef __KERNEL_SSE__
+ /* TODO(sergey): avoid cvt. */
+ return int4(_mm_cvtps_epi32(_mm_cmplt_ps(a.m128, b.m128)));
+#else
+ return make_int4(a.x < b.x, a.y < b.y, a.z < b.z, a.w < b.w);
+#endif
+}
+
+ccl_device_inline int4 operator>=(const float4& a, const float4& b)
+{
+#ifdef __KERNEL_SSE__
+ /* TODO(sergey): avoid cvt. */
+ return int4(_mm_cvtps_epi32(_mm_cmpge_ps(a.m128, b.m128)));
+#else
+ return make_int4(a.x >= b.x, a.y >= b.y, a.z >= b.z, a.w >= b.w);
+#endif
+}
+
+ccl_device_inline int4 operator<=(const float4& a, const float4& b)
+{
+#ifdef __KERNEL_SSE__
+ /* TODO(sergey): avoid cvt. */
+ return int4(_mm_cvtps_epi32(_mm_cmple_ps(a.m128, b.m128)));
+#else
+ return make_int4(a.x <= b.x, a.y <= b.y, a.z <= b.z, a.w <= b.w);
+#endif
+}
+
+ccl_device_inline bool operator==(const float4& a, const float4& b)
+{
+#ifdef __KERNEL_SSE__
+ return (_mm_movemask_ps(_mm_cmpeq_ps(a.m128, b.m128)) & 15) == 15;
+#else
+ return (a.x == b.x && a.y == b.y && a.z == b.z && a.w == b.w);
+#endif
+}
+
+ccl_device_inline float dot(const float4& a, const float4& b)
+{
+#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
+ return _mm_cvtss_f32(_mm_dp_ps(a, b, 0xFF));
+#else
+ return (a.x*b.x + a.y*b.y) + (a.z*b.z + a.w*b.w);
+#endif
+}
+
+ccl_device_inline float len_squared(const float4& a)
+{
+ return dot(a, a);
+}
+
+ccl_device_inline float4 rcp(const float4& a)
+{
+#ifdef __KERNEL_SSE__
+ float4 r(_mm_rcp_ps(a.m128));
+ return float4(_mm_sub_ps(_mm_add_ps(r, r),
+ _mm_mul_ps(_mm_mul_ps(r, r), a)));
+#else
+ return make_float4(1.0f/a.x, 1.0f/a.y, 1.0f/a.z, 1.0f/a.w);
+#endif
+}
+
+ccl_device_inline float4 cross(const float4& a, const float4& b)
+{
+#ifdef __KERNEL_SSE__
+ return (shuffle<1,2,0,0>(a)*shuffle<2,0,1,0>(b)) -
+ (shuffle<2,0,1,0>(a)*shuffle<1,2,0,0>(b));
+#else
+ return make_float4(a.y*b.z - a.z*b.y,
+ a.z*b.x - a.x*b.z,
+ a.x*b.y - a.y*b.x,
+ 0.0f);
+#endif
+}
+
+ccl_device_inline bool is_zero(const float4& a)
+{
+#ifdef __KERNEL_SSE__
+ return a == make_float4(0.0f);
+#else
+ return (a.x == 0.0f && a.y == 0.0f && a.z == 0.0f && a.w == 0.0f);
+#endif
+}
+
+ccl_device_inline float reduce_add(const float4& a)
+{
+#ifdef __KERNEL_SSE__
+ float4 h(shuffle<1,0,3,2>(a) + a);
+ /* TODO(sergey): Investigate efficiency. */
+ return _mm_cvtss_f32(shuffle<2,3,0,1>(h) + h);
+#else
+ return ((a.x + a.y) + (a.z + a.w));
+#endif
+}
+
+ccl_device_inline float average(const float4& a)
+{
+ return reduce_add(a) * 0.25f;
+}
+
+ccl_device_inline float len(const float4& a)
+{
+ return sqrtf(dot(a, a));
+}
+
+ccl_device_inline float4 normalize(const float4& a)
+{
+ return a/len(a);
+}
+
+ccl_device_inline float4 safe_normalize(const float4& a)
+{
+ float t = len(a);
+ return (t != 0.0f)? a/t: a;
+}
+
+ccl_device_inline float4 min(const float4& a, const float4& b)
+{
+#ifdef __KERNEL_SSE__
+ return float4(_mm_min_ps(a.m128, b.m128));
+#else
+ return make_float4(min(a.x, b.x),
+ min(a.y, b.y),
+ min(a.z, b.z),
+ min(a.w, b.w));
+#endif
+}
+
+ccl_device_inline float4 max(const float4& a, const float4& b)
+{
+#ifdef __KERNEL_SSE__
+ return float4(_mm_max_ps(a.m128, b.m128));
+#else
+ return make_float4(max(a.x, b.x),
+ max(a.y, b.y),
+ max(a.z, b.z),
+ max(a.w, b.w));
+#endif
+}
+#endif /* !__KERNEL_OPENCL__ */
+
+#ifdef __KERNEL_SSE__
+template<size_t index_0, size_t index_1, size_t index_2, size_t index_3>
+__forceinline const float4 shuffle(const float4& b)
+{
+ return float4(_mm_castsi128_ps(
+ _mm_shuffle_epi32(_mm_castps_si128(b),
+ _MM_SHUFFLE(index_3, index_2, index_1, index_0))));
+}
+
+template<> __forceinline const float4 shuffle<0, 1, 0, 1>(const float4& b)
+{
+ return float4(_mm_castpd_ps(_mm_movedup_pd(_mm_castps_pd(b))));
+}
+
+# ifdef __KERNEL_SSE3__
+template<> __forceinline const float4 shuffle<0, 0, 2, 2>(const float4& b)
+{
+ return float4(_mm_moveldup_ps(b));
+}
+
+template<> __forceinline const float4 shuffle<1, 1, 3, 3>(const float4& b)
+{
+ return float4(_mm_movehdup_ps(b));
+}
+# endif /* __KERNEL_SSE3__ */
+#endif /* __KERNEL_SSE__ */
+
+#ifndef __KERNEL_GPU__
+ccl_device_inline float4 select(const int4& mask,
+ const float4& a,
+ const float4& b)
+{
+#ifdef __KERNEL_SSE__
+ /* TODO(sergey): avoid cvt. */
+ return float4(_mm_or_ps(_mm_and_ps(_mm_cvtepi32_ps(mask), a),
+ _mm_andnot_ps(_mm_cvtepi32_ps(mask), b)));
+#else
+ return make_float4((mask.x)? a.x: b.x,
+ (mask.y)? a.y: b.y,
+ (mask.z)? a.z: b.z,
+ (mask.w)? a.w: b.w);
+#endif
+}
+
+ccl_device_inline float4 reduce_min(const float4& a)
+{
+#ifdef __KERNEL_SSE__
+ float4 h = min(shuffle<1,0,3,2>(a), a);
+ return min(shuffle<2,3,0,1>(h), h);
+#else
+ return make_float4(min(min(a.x, a.y), min(a.z, a.w)));
+#endif
+}
+
+ccl_device_inline float4 reduce_max(const float4& a)
+{
+#ifdef __KERNEL_SSE__
+ float4 h = max(shuffle<1,0,3,2>(a), a);
+ return max(shuffle<2,3,0,1>(h), h);
+#else
+ return make_float4(max(max(a.x, a.y), max(a.z, a.w)));
+#endif
+}
+
+#if 0
+ccl_device_inline float4 reduce_add(const float4& a)
+{
+#ifdef __KERNEL_SSE__
+ float4 h = shuffle<1,0,3,2>(a) + a;
+ return shuffle<2,3,0,1>(h) + h;
+#else
+ return make_float4((a.x + a.y) + (a.z + a.w));
+#endif
+}
+#endif
+#endif /* !__KERNEL_GPU__ */
+
+CCL_NAMESPACE_END
+
+#endif /* __UTIL_MATH_FLOAT4_H__ */
diff --git a/intern/cycles/util/util_math_int2.h b/intern/cycles/util/util_math_int2.h
new file mode 100644
index 00000000000..828c49a131c
--- /dev/null
+++ b/intern/cycles/util/util_math_int2.h
@@ -0,0 +1,77 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __UTIL_MATH_INT2_H__
+#define __UTIL_MATH_INT2_H__
+
+#ifndef __UTIL_MATH_H__
+# error "Do not include this file directly, include util_types.h instead."
+#endif
+
+CCL_NAMESPACE_BEGIN
+
+/*******************************************************************************
+ * Declaration.
+ */
+
+#ifndef __KERNEL_OPENCL__
+ccl_device_inline bool operator==(const int2 a, const int2 b);
+ccl_device_inline int2 operator+(const int2 &a, const int2 &b);
+ccl_device_inline int2 operator+=(int2 &a, const int2 &b);
+ccl_device_inline int2 operator-(const int2 &a, const int2 &b);
+ccl_device_inline int2 operator*(const int2 &a, const int2 &b);
+ccl_device_inline int2 operator/(const int2 &a, const int2 &b);
+#endif /* !__KERNEL_OPENCL__ */
+
+/*******************************************************************************
+ * Definition.
+ */
+
+#ifndef __KERNEL_OPENCL__
+ccl_device_inline bool operator==(const int2 a, const int2 b)
+{
+ return (a.x == b.x && a.y == b.y);
+}
+
+ccl_device_inline int2 operator+(const int2 &a, const int2 &b)
+{
+ return make_int2(a.x + b.x, a.y + b.y);
+}
+
+ccl_device_inline int2 operator+=(int2 &a, const int2 &b)
+{
+ return a = a + b;
+}
+
+ccl_device_inline int2 operator-(const int2 &a, const int2 &b)
+{
+ return make_int2(a.x - b.x, a.y - b.y);
+}
+
+ccl_device_inline int2 operator*(const int2 &a, const int2 &b)
+{
+ return make_int2(a.x * b.x, a.y * b.y);
+}
+
+ccl_device_inline int2 operator/(const int2 &a, const int2 &b)
+{
+ return make_int2(a.x / b.x, a.y / b.y);
+}
+#endif /* !__KERNEL_OPENCL__ */
+
+CCL_NAMESPACE_END
+
+#endif /* __UTIL_MATH_INT2_H__ */
diff --git a/intern/cycles/util/util_math_int3.h b/intern/cycles/util/util_math_int3.h
new file mode 100644
index 00000000000..fa7a02636de
--- /dev/null
+++ b/intern/cycles/util/util_math_int3.h
@@ -0,0 +1,83 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __UTIL_MATH_INT3_H__
+#define __UTIL_MATH_INT3_H__
+
+#ifndef __UTIL_MATH_H__
+# error "Do not include this file directly, include util_types.h instead."
+#endif
+
+CCL_NAMESPACE_BEGIN
+
+/*******************************************************************************
+ * Declaration.
+ */
+
+#ifndef __KERNEL_OPENCL__
+ccl_device_inline int3 min(int3 a, int3 b);
+ccl_device_inline int3 max(int3 a, int3 b);
+ccl_device_inline int3 clamp(const int3& a, int mn, int mx);
+ccl_device_inline int3 clamp(const int3& a, int3& mn, int mx);
+#endif /* !__KERNEL_OPENCL__ */
+
+/*******************************************************************************
+ * Definition.
+ */
+
+#ifndef __KERNEL_OPENCL__
+ccl_device_inline int3 min(int3 a, int3 b)
+{
+#if defined(__KERNEL_SSE__) && defined(__KERNEL_SSE41__)
+ return int3(_mm_min_epi32(a.m128, b.m128));
+#else
+ return make_int3(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z));
+#endif
+}
+
+ccl_device_inline int3 max(int3 a, int3 b)
+{
+#if defined(__KERNEL_SSE__) && defined(__KERNEL_SSE41__)
+ return int3(_mm_max_epi32(a.m128, b.m128));
+#else
+ return make_int3(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z));
+#endif
+}
+
+ccl_device_inline int3 clamp(const int3& a, int mn, int mx)
+{
+#ifdef __KERNEL_SSE__
+ return min(max(a, make_int3(mn)), make_int3(mx));
+#else
+ return make_int3(clamp(a.x, mn, mx), clamp(a.y, mn, mx), clamp(a.z, mn, mx));
+#endif
+}
+
+ccl_device_inline int3 clamp(const int3& a, int3& mn, int mx)
+{
+#ifdef __KERNEL_SSE__
+ return min(max(a, mn), make_int3(mx));
+#else
+ return make_int3(clamp(a.x, mn.x, mx),
+ clamp(a.y, mn.y, mx),
+ clamp(a.z, mn.z, mx));
+#endif
+}
+#endif /* !__KERNEL_OPENCL__ */
+
+CCL_NAMESPACE_END
+
+#endif /* __UTIL_MATH_INT3_H__ */
diff --git a/intern/cycles/util/util_math_int4.h b/intern/cycles/util/util_math_int4.h
new file mode 100644
index 00000000000..79a8c0841e7
--- /dev/null
+++ b/intern/cycles/util/util_math_int4.h
@@ -0,0 +1,119 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __UTIL_MATH_INT4_H__
+#define __UTIL_MATH_INT4_H__
+
+#ifndef __UTIL_MATH_H__
+# error "Do not include this file directly, include util_types.h instead."
+#endif
+
+CCL_NAMESPACE_BEGIN
+
+/*******************************************************************************
+ * Declaration.
+ */
+
+#ifndef __KERNEL_GPU__
+ccl_device_inline int4 operator+(const int4& a, const int4& b);
+ccl_device_inline int4 operator+=(int4& a, const int4& b);
+ccl_device_inline int4 operator>>(const int4& a, int i);
+ccl_device_inline int4 min(int4 a, int4 b);
+ccl_device_inline int4 max(int4 a, int4 b);
+ccl_device_inline int4 clamp(const int4& a, const int4& mn, const int4& mx);
+ccl_device_inline int4 select(const int4& mask, const int4& a, const int4& b);
+#endif /* !__KERNEL_GPU__ */
+
+/*******************************************************************************
+ * Definition.
+ */
+
+#ifndef __KERNEL_GPU__
+ccl_device_inline int4 operator+(const int4& a, const int4& b)
+{
+#ifdef __KERNEL_SSE__
+ return int4(_mm_add_epi32(a.m128, b.m128));
+#else
+ return make_int4(a.x+b.x, a.y+b.y, a.z+b.z, a.w+b.w);
+#endif
+}
+
+ccl_device_inline int4 operator+=(int4& a, const int4& b)
+{
+ return a = a + b;
+}
+
+ccl_device_inline int4 operator>>(const int4& a, int i)
+{
+#ifdef __KERNEL_SSE__
+ return int4(_mm_srai_epi32(a.m128, i));
+#else
+ return make_int4(a.x >> i, a.y >> i, a.z >> i, a.w >> i);
+#endif
+}
+
+ccl_device_inline int4 min(int4 a, int4 b)
+{
+#if defined(__KERNEL_SSE__) && defined(__KERNEL_SSE41__)
+ return int4(_mm_min_epi32(a.m128, b.m128));
+#else
+ return make_int4(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z), min(a.w, b.w));
+#endif
+}
+
+ccl_device_inline int4 max(int4 a, int4 b)
+{
+#if defined(__KERNEL_SSE__) && defined(__KERNEL_SSE41__)
+ return int4(_mm_max_epi32(a.m128, b.m128));
+#else
+ return make_int4(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z), max(a.w, b.w));
+#endif
+}
+
+ccl_device_inline int4 clamp(const int4& a, const int4& mn, const int4& mx)
+{
+ return min(max(a, mn), mx);
+}
+
+ccl_device_inline int4 select(const int4& mask, const int4& a, const int4& b)
+{
+#ifdef __KERNEL_SSE__
+ const __m128 m = _mm_cvtepi32_ps(mask);
+ /* TODO(sergey): avoid cvt. */
+ return int4(_mm_castps_si128(
+ _mm_or_ps(_mm_and_ps(m, _mm_castsi128_ps(a)),
+ _mm_andnot_ps(m, _mm_castsi128_ps(b)))));
+#else
+ return make_int4((mask.x)? a.x: b.x,
+ (mask.y)? a.y: b.y,
+ (mask.z)? a.z: b.z,
+ (mask.w)? a.w: b.w);
+#endif
+}
+
+ccl_device_inline int4 load_int4(const int *v)
+{
+#ifdef __KERNEL_SSE__
+ return int4(_mm_loadu_si128((__m128i*)v));
+#else
+ return make_int4(v[0], v[1], v[2], v[3]);
+#endif
+}
+#endif /* !__KERNEL_GPU__ */
+
+CCL_NAMESPACE_END
+
+#endif /* __UTIL_MATH_INT4_H__ */
diff --git a/intern/cycles/util/util_math_matrix.h b/intern/cycles/util/util_math_matrix.h
new file mode 100644
index 00000000000..c7511f8306e
--- /dev/null
+++ b/intern/cycles/util/util_math_matrix.h
@@ -0,0 +1,404 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __UTIL_MATH_MATRIX_H__
+#define __UTIL_MATH_MATRIX_H__
+
+CCL_NAMESPACE_BEGIN
+
+#define MAT(A, size, row, col) A[(row)*(size)+(col)]
+
+/* Variants that use a constant stride on GPUS. */
+#ifdef __KERNEL_GPU__
+# define MATS(A, n, r, c, s) A[((r)*(n)+(c))*(s)]
+/* Element access when only the lower-triangular elements are stored. */
+# define MATHS(A, r, c, s) A[((r)*((r)+1)/2+(c))*(s)]
+# define VECS(V, i, s) V[(i)*(s)]
+#else
+# define MATS(A, n, r, c, s) MAT(A, n, r, c)
+# define MATHS(A, r, c, s) A[(r)*((r)+1)/2+(c)]
+# define VECS(V, i, s) V[i]
+#endif
+
+/* Zeroing helpers. */
+
+ccl_device_inline void math_vector_zero(float *v, int n)
+{
+ for(int i = 0; i < n; i++) {
+ v[i] = 0.0f;
+ }
+}
+
+ccl_device_inline void math_matrix_zero(float *A, int n)
+{
+ for(int row = 0; row < n; row++) {
+ for(int col = 0; col <= row; col++) {
+ MAT(A, n, row, col) = 0.0f;
+ }
+ }
+}
+
+/* Elementary vector operations. */
+
+ccl_device_inline void math_vector_add(float *a, const float *ccl_restrict b, int n)
+{
+ for(int i = 0; i < n; i++) {
+ a[i] += b[i];
+ }
+}
+
+ccl_device_inline void math_vector_mul(float *a, const float *ccl_restrict b, int n)
+{
+ for(int i = 0; i < n; i++) {
+ a[i] *= b[i];
+ }
+}
+
+ccl_device_inline void math_vector_mul_strided(ccl_global float *a, const float *ccl_restrict b, int astride, int n)
+{
+ for(int i = 0; i < n; i++) {
+ a[i*astride] *= b[i];
+ }
+}
+
+ccl_device_inline void math_vector_scale(float *a, float b, int n)
+{
+ for(int i = 0; i < n; i++) {
+ a[i] *= b;
+ }
+}
+
+ccl_device_inline void math_vector_max(float *a, const float *ccl_restrict b, int n)
+{
+ for(int i = 0; i < n; i++) {
+ a[i] = max(a[i], b[i]);
+ }
+}
+
+ccl_device_inline void math_vec3_add(float3 *v, int n, float *x, float3 w)
+{
+ for(int i = 0; i < n; i++) {
+ v[i] += w*x[i];
+ }
+}
+
+ccl_device_inline void math_vec3_add_strided(ccl_global float3 *v, int n, float *x, float3 w, int stride)
+{
+ for(int i = 0; i < n; i++) {
+ v[i*stride] += w*x[i];
+ }
+}
+
+/* Elementary matrix operations.
+ * Note: TriMatrix refers to a square matrix that is symmetric, and therefore its upper-triangular part isn't stored. */
+
+ccl_device_inline void math_trimatrix_add_diagonal(ccl_global float *A, int n, float val, int stride)
+{
+ for(int row = 0; row < n; row++) {
+ MATHS(A, row, row, stride) += val;
+ }
+}
+
+/* Add Gramian matrix of v to A.
+ * The Gramian matrix of v is vt*v, so element (i,j) is v[i]*v[j]. */
+ccl_device_inline void math_matrix_add_gramian(float *A,
+ int n,
+ const float *ccl_restrict v,
+ float weight)
+{
+ for(int row = 0; row < n; row++) {
+ for(int col = 0; col <= row; col++) {
+ MAT(A, n, row, col) += v[row]*v[col]*weight;
+ }
+ }
+}
+
+/* Add Gramian matrix of v to A.
+ * The Gramian matrix of v is vt*v, so element (i,j) is v[i]*v[j]. */
+ccl_device_inline void math_trimatrix_add_gramian_strided(ccl_global float *A,
+ int n,
+ const float *ccl_restrict v,
+ float weight,
+ int stride)
+{
+ for(int row = 0; row < n; row++) {
+ for(int col = 0; col <= row; col++) {
+ MATHS(A, row, col, stride) += v[row]*v[col]*weight;
+ }
+ }
+}
+
+/* Transpose matrix A inplace. */
+ccl_device_inline void math_matrix_transpose(ccl_global float *A, int n, int stride)
+{
+ for(int i = 0; i < n; i++) {
+ for(int j = 0; j < i; j++) {
+ float temp = MATS(A, n, i, j, stride);
+ MATS(A, n, i, j, stride) = MATS(A, n, j, i, stride);
+ MATS(A, n, j, i, stride) = temp;
+ }
+ }
+}
+
+/* Solvers for matrix problems */
+
+/* In-place Cholesky-Banachiewicz decomposition of the square, positive-definite matrix A
+ * into a lower triangular matrix L so that A = L*L^T. A is being overwritten by L.
+ * Also, only the lower triangular part of A is ever accessed. */
+ccl_device void math_trimatrix_cholesky(ccl_global float *A, int n, int stride)
+{
+ for(int row = 0; row < n; row++) {
+ for(int col = 0; col <= row; col++) {
+ float sum_col = MATHS(A, row, col, stride);
+ for(int k = 0; k < col; k++) {
+ sum_col -= MATHS(A, row, k, stride) * MATHS(A, col, k, stride);
+ }
+ if(row == col) {
+ sum_col = sqrtf(max(sum_col, 0.0f));
+ }
+ else {
+ sum_col /= MATHS(A, col, col, stride);
+ }
+ MATHS(A, row, col, stride) = sum_col;
+ }
+ }
+}
+
+/* Solve A*S=y for S given A and y, where A is symmetrical positive-semidefinite and both inputs are destroyed in the process.
+ *
+ * We can apply Cholesky decomposition to find a lower triangular L so that L*Lt = A.
+ * With that we get (L*Lt)*S = L*(Lt*S) = L*b = y, defining b as Lt*S.
+ * Since L is lower triangular, finding b is relatively easy since y is known.
+ * Then, the remaining problem is Lt*S = b, which again can be solved easily.
+ *
+ * This is useful for solving the normal equation S=inv(Xt*W*X)*Xt*W*y, since Xt*W*X is
+ * symmetrical positive-semidefinite by construction, so we can just use this function with A=Xt*W*X and y=Xt*W*y. */
+ccl_device_inline void math_trimatrix_vec3_solve(ccl_global float *A, ccl_global float3 *y, int n, int stride)
+{
+ /* Since the first entry of the design row is always 1, the upper-left element of XtWX is a good
+ * heuristic for the amount of pixels considered (with weighting), therefore the amount of correction
+ * is scaled based on it. */
+ math_trimatrix_add_diagonal(A, n, 3e-7f*A[0], stride); /* Improve the numerical stability. */
+ math_trimatrix_cholesky(A, n, stride); /* Replace A with L so that L*Lt = A. */
+
+ /* Use forward substitution to solve L*b = y, replacing y by b. */
+ for(int row = 0; row < n; row++) {
+ float3 sum = VECS(y, row, stride);
+ for(int col = 0; col < row; col++)
+ sum -= MATHS(A, row, col, stride) * VECS(y, col, stride);
+ VECS(y, row, stride) = sum / MATHS(A, row, row, stride);
+ }
+
+ /* Use backward substitution to solve Lt*S = b, replacing b by S. */
+ for(int row = n-1; row >= 0; row--) {
+ float3 sum = VECS(y, row, stride);
+ for(int col = row+1; col < n; col++)
+ sum -= MATHS(A, col, row, stride) * VECS(y, col, stride);
+ VECS(y, row, stride) = sum / MATHS(A, row, row, stride);
+ }
+}
+
+/* Perform the Jacobi Eigenvalue Method on matrix A.
+ * A is assumed to be a symmetrical matrix, therefore only the lower-triangular part is ever accessed.
+ * The algorithm overwrites the contents of A.
+ *
+ * After returning, A will be overwritten with D, which is (almost) diagonal,
+ * and V will contain the eigenvectors of the original A in its rows (!),
+ * so that A = V^T*D*V. Therefore, the diagonal elements of D are the (sorted) eigenvalues of A.
+ */
+ccl_device void math_matrix_jacobi_eigendecomposition(float *A, ccl_global float *V, int n, int v_stride)
+{
+ const float singular_epsilon = 1e-9f;
+
+ for (int row = 0; row < n; row++) {
+ for (int col = 0; col < n; col++) {
+ MATS(V, n, row, col, v_stride) = (col == row) ? 1.0f : 0.0f;
+ }
+ }
+
+ for (int sweep = 0; sweep < 8; sweep++) {
+ float off_diagonal = 0.0f;
+ for (int row = 1; row < n; row++) {
+ for (int col = 0; col < row; col++) {
+ off_diagonal += fabsf(MAT(A, n, row, col));
+ }
+ }
+ if (off_diagonal < 1e-7f) {
+ /* The matrix has nearly reached diagonal form.
+ * Since the eigenvalues are only used to determine truncation, their exact values aren't required - a relative error of a few ULPs won't matter at all. */
+ break;
+ }
+
+ /* Set the threshold for the small element rotation skip in the first sweep:
+ * Skip all elements that are less than a tenth of the average off-diagonal element. */
+ float threshold = 0.2f*off_diagonal / (n*n);
+
+ for(int row = 1; row < n; row++) {
+ for(int col = 0; col < row; col++) {
+ /* Perform a Jacobi rotation on this element that reduces it to zero. */
+ float element = MAT(A, n, row, col);
+ float abs_element = fabsf(element);
+
+ /* If we're in a later sweep and the element already is very small, just set it to zero and skip the rotation. */
+ if (sweep > 3 && abs_element <= singular_epsilon*fabsf(MAT(A, n, row, row)) && abs_element <= singular_epsilon*fabsf(MAT(A, n, col, col))) {
+ MAT(A, n, row, col) = 0.0f;
+ continue;
+ }
+
+ if(element == 0.0f) {
+ continue;
+ }
+
+ /* If we're in one of the first sweeps and the element is smaller than the threshold, skip it. */
+ if(sweep < 3 && (abs_element < threshold)) {
+ continue;
+ }
+
+ /* Determine rotation: The rotation is characterized by its angle phi - or, in the actual implementation, sin(phi) and cos(phi).
+ * To find those, we first compute their ratio - that might be unstable if the angle approaches 90°, so there's a fallback for that case.
+ * Then, we compute sin(phi) and cos(phi) themselves. */
+ float singular_diff = MAT(A, n, row, row) - MAT(A, n, col, col);
+ float ratio;
+ if (abs_element > singular_epsilon*fabsf(singular_diff)) {
+ float cot_2phi = 0.5f*singular_diff / element;
+ ratio = 1.0f / (fabsf(cot_2phi) + sqrtf(1.0f + cot_2phi*cot_2phi));
+ if (cot_2phi < 0.0f) ratio = -ratio; /* Copy sign. */
+ }
+ else {
+ ratio = element / singular_diff;
+ }
+
+ float c = 1.0f / sqrtf(1.0f + ratio*ratio);
+ float s = ratio*c;
+ /* To improve numerical stability by avoiding cancellation, the update equations are reformulated to use sin(phi) and tan(phi/2) instead. */
+ float tan_phi_2 = s / (1.0f + c);
+
+ /* Update the singular values in the diagonal. */
+ float singular_delta = ratio*element;
+ MAT(A, n, row, row) += singular_delta;
+ MAT(A, n, col, col) -= singular_delta;
+
+ /* Set the element itself to zero. */
+ MAT(A, n, row, col) = 0.0f;
+
+ /* Perform the actual rotations on the matrices. */
+#define ROT(M, r1, c1, r2, c2, stride) \
+ { \
+ float M1 = MATS(M, n, r1, c1, stride); \
+ float M2 = MATS(M, n, r2, c2, stride); \
+ MATS(M, n, r1, c1, stride) -= s*(M2 + tan_phi_2*M1); \
+ MATS(M, n, r2, c2, stride) += s*(M1 - tan_phi_2*M2); \
+ }
+
+ /* Split into three parts to ensure correct accesses since we only store the lower-triangular part of A. */
+ for(int i = 0 ; i < col; i++) ROT(A, col, i, row, i, 1);
+ for(int i = col+1; i < row; i++) ROT(A, i, col, row, i, 1);
+ for(int i = row+1; i < n ; i++) ROT(A, i, col, i, row, 1);
+
+ for(int i = 0 ; i < n ; i++) ROT(V, col, i, row, i, v_stride);
+#undef ROT
+ }
+ }
+ }
+
+ /* Sort eigenvalues and the associated eigenvectors. */
+ for (int i = 0; i < n - 1; i++) {
+ float v = MAT(A, n, i, i);
+ int k = i;
+ for (int j = i; j < n; j++) {
+ if (MAT(A, n, j, j) >= v) {
+ v = MAT(A, n, j, j);
+ k = j;
+ }
+ }
+ if (k != i) {
+ /* Swap eigenvalues. */
+ MAT(A, n, k, k) = MAT(A, n, i, i);
+ MAT(A, n, i, i) = v;
+ /* Swap eigenvectors. */
+ for (int j = 0; j < n; j++) {
+ float v = MATS(V, n, i, j, v_stride);
+ MATS(V, n, i, j, v_stride) = MATS(V, n, k, j, v_stride);
+ MATS(V, n, k, j, v_stride) = v;
+ }
+ }
+ }
+}
+
+#ifdef __KERNEL_SSE3__
+ccl_device_inline void math_vector_zero_sse(__m128 *A, int n)
+{
+ for(int i = 0; i < n; i++) {
+ A[i] = _mm_setzero_ps();
+ }
+}
+
+ccl_device_inline void math_matrix_zero_sse(__m128 *A, int n)
+{
+ for(int row = 0; row < n; row++) {
+ for(int col = 0; col <= row; col++) {
+ MAT(A, n, row, col) = _mm_setzero_ps();
+ }
+ }
+}
+
+/* Add Gramian matrix of v to A.
+ * The Gramian matrix of v is v^T*v, so element (i,j) is v[i]*v[j]. */
+ccl_device_inline void math_matrix_add_gramian_sse(__m128 *A, int n, const __m128 *ccl_restrict v, __m128 weight)
+{
+ for(int row = 0; row < n; row++) {
+ for(int col = 0; col <= row; col++) {
+ MAT(A, n, row, col) = _mm_add_ps(MAT(A, n, row, col), _mm_mul_ps(_mm_mul_ps(v[row], v[col]), weight));
+ }
+ }
+}
+
+ccl_device_inline void math_vector_add_sse(__m128 *V, int n, const __m128 *ccl_restrict a)
+{
+ for(int i = 0; i < n; i++) {
+ V[i] = _mm_add_ps(V[i], a[i]);
+ }
+}
+
+ccl_device_inline void math_vector_mul_sse(__m128 *V, int n, const __m128 *ccl_restrict a)
+{
+ for(int i = 0; i < n; i++) {
+ V[i] = _mm_mul_ps(V[i], a[i]);
+ }
+}
+
+ccl_device_inline void math_vector_max_sse(__m128 *a, const __m128 *ccl_restrict b, int n)
+{
+ for(int i = 0; i < n; i++) {
+ a[i] = _mm_max_ps(a[i], b[i]);
+ }
+}
+
+ccl_device_inline void math_matrix_hsum(float *A, int n, const __m128 *ccl_restrict B)
+{
+ for(int row = 0; row < n; row++) {
+ for(int col = 0; col <= row; col++) {
+ MAT(A, n, row, col) = _mm_hsum_ss(MAT(B, n, row, col));
+ }
+ }
+}
+#endif
+
+#undef MAT
+
+CCL_NAMESPACE_END
+
+#endif /* __UTIL_MATH_MATRIX_H__ */
diff --git a/intern/cycles/util/util_path.cpp b/intern/cycles/util/util_path.cpp
index cd3067f7650..f9c3b4bb139 100644
--- a/intern/cycles/util/util_path.cpp
+++ b/intern/cycles/util/util_path.cpp
@@ -768,9 +768,17 @@ bool path_remove(const string& path)
return remove(path.c_str()) == 0;
}
-static string line_directive(const string& path, int line)
+static string line_directive(const string& base, const string& path, int line)
{
string escaped_path = path;
+ /* First we make path relative. */
+ if(string_startswith(escaped_path, base.c_str())) {
+ const string base_file = path_filename(base);
+ const size_t base_len = base.length();
+ escaped_path = base_file + escaped_path.substr(base_len,
+ escaped_path.length() - base_len);
+ }
+ /* Second, we replace all unsafe characters. */
string_replace(escaped_path, "\"", "\\\"");
string_replace(escaped_path, "\'", "\\\'");
string_replace(escaped_path, "\?", "\\\?");
@@ -778,13 +786,13 @@ static string line_directive(const string& path, int line)
return string_printf("#line %d \"%s\"", line, escaped_path.c_str());
}
-
-string path_source_replace_includes(const string& source,
- const string& path,
- const string& source_filename)
+static string path_source_replace_includes_recursive(
+ const string& base,
+ const string& source,
+ const string& source_filepath)
{
/* Our own little c preprocessor that replaces #includes with the file
- * contents, to work around issue of opencl drivers not supporting
+ * contents, to work around issue of OpenCL drivers not supporting
* include paths with spaces in them.
*/
@@ -799,23 +807,22 @@ string path_source_replace_includes(const string& source,
if(string_startswith(token, "include")) {
token = string_strip(token.substr(7, token.size() - 7));
if(token[0] == '"') {
- size_t n_start = 1;
- size_t n_end = token.find("\"", n_start);
- string filename = token.substr(n_start, n_end - n_start);
- string text, filepath = path_join(path, filename);
+ const size_t n_start = 1;
+ const size_t n_end = token.find("\"", n_start);
+ const string filename = token.substr(n_start, n_end - n_start);
+ string filepath = path_join(base, filename);
+ if(!path_exists(filepath)) {
+ filepath = path_join(path_dirname(source_filepath),
+ filename);
+ }
+ string text;
if(path_read_text(filepath, text)) {
- /* Replace include directories with both current path
- * and path extracted from the include file.
- * Not totally robust, but works fine for Cycles kernel
- * and avoids having list of include directories.x
- */
- text = path_source_replace_includes(
- text, path_dirname(filepath), filename);
- text = path_source_replace_includes(text, path, filename);
+ text = path_source_replace_includes_recursive(
+ base, text, filepath);
/* Use line directives for better error messages. */
- line = line_directive(filepath, 1)
+ line = line_directive(base, filepath, 1)
+ token.replace(0, n_end + 1, "\n" + text + "\n")
- + line_directive(path_join(path, source_filename), i + 1);
+ + line_directive(base, source_filepath, i + 1);
}
}
}
@@ -826,6 +833,16 @@ string path_source_replace_includes(const string& source,
return result;
}
+string path_source_replace_includes(const string& source,
+ const string& path,
+ const string& source_filename)
+{
+ return path_source_replace_includes_recursive(
+ path,
+ source,
+ path_join(path, source_filename));
+}
+
FILE *path_fopen(const string& path, const string& mode)
{
#ifdef _WIN32
diff --git a/intern/cycles/util/util_progress.h b/intern/cycles/util/util_progress.h
index 39c1eed04e7..134383e88db 100644
--- a/intern/cycles/util/util_progress.h
+++ b/intern/cycles/util/util_progress.h
@@ -37,9 +37,11 @@ public:
pixel_samples = 0;
total_pixel_samples = 0;
current_tile_sample = 0;
- finished_tiles = 0;
+ rendered_tiles = 0;
+ denoised_tiles = 0;
start_time = time_dt();
render_start_time = time_dt();
+ end_time = 0.0;
status = "Initializing";
substatus = "";
sync_status = "";
@@ -75,9 +77,11 @@ public:
pixel_samples = 0;
total_pixel_samples = 0;
current_tile_sample = 0;
- finished_tiles = 0;
+ rendered_tiles = 0;
+ denoised_tiles = 0;
start_time = time_dt();
render_start_time = time_dt();
+ end_time = 0.0;
status = "Initializing";
substatus = "";
sync_status = "";
@@ -144,6 +148,7 @@ public:
thread_scoped_lock lock(progress_mutex);
start_time = time_dt();
+ end_time = 0.0;
}
void set_render_start_time()
@@ -167,8 +172,15 @@ public:
{
thread_scoped_lock lock(progress_mutex);
- total_time_ = time_dt() - start_time;
- render_time_ = time_dt() - render_start_time;
+ double time = (end_time > 0) ? end_time : time_dt();
+
+ total_time_ = time - start_time;
+ render_time_ = time - render_start_time;
+ }
+
+ void set_end_time()
+ {
+ end_time = time_dt();
}
void reset_sample()
@@ -177,7 +189,8 @@ public:
pixel_samples = 0;
current_tile_sample = 0;
- finished_tiles = 0;
+ rendered_tiles = 0;
+ denoised_tiles = 0;
}
void set_total_pixel_samples(uint64_t total_pixel_samples_)
@@ -209,23 +222,36 @@ public:
set_update();
}
- void add_finished_tile()
+ void add_finished_tile(bool denoised)
{
thread_scoped_lock lock(progress_mutex);
- finished_tiles++;
+ if(denoised) {
+ denoised_tiles++;
+ }
+ else {
+ rendered_tiles++;
+ }
}
int get_current_sample()
{
+ thread_scoped_lock lock(progress_mutex);
/* Note that the value here always belongs to the last tile that updated,
* so it's only useful if there is only one active tile. */
return current_tile_sample;
}
- int get_finished_tiles()
+ int get_rendered_tiles()
+ {
+ thread_scoped_lock lock(progress_mutex);
+ return rendered_tiles;
+ }
+
+ int get_denoised_tiles()
{
- return finished_tiles;
+ thread_scoped_lock lock(progress_mutex);
+ return denoised_tiles;
}
/* status messages */
@@ -318,9 +344,11 @@ protected:
int current_tile_sample;
/* Stores the number of tiles that's already finished.
* Used to determine whether all but the last tile are finished rendering, in which case the current_tile_sample is displayed. */
- int finished_tiles;
+ int rendered_tiles, denoised_tiles;
double start_time, render_start_time;
+ /* End time written when render is done, so it doesn't keep increasing on redraws. */
+ double end_time;
string status;
string substatus;
diff --git a/intern/cycles/util/util_simd.h b/intern/cycles/util/util_simd.h
index 557809a5719..587febe3e52 100644
--- a/intern/cycles/util/util_simd.h
+++ b/intern/cycles/util/util_simd.h
@@ -331,9 +331,9 @@ __forceinline size_t __bscf(size_t& v)
static const unsigned int BITSCAN_NO_BIT_SET_32 = 32;
static const size_t BITSCAN_NO_BIT_SET_64 = 64;
+#ifdef __KERNEL_SSE3__
/* Emulation of SSE4 functions with SSE3 */
-
-#if defined(__KERNEL_SSE3) && !defined(__KERNEL_SSE4__)
+# ifndef __KERNEL_SSE41__
#define _MM_FROUND_TO_NEAREST_INT 0x00
#define _MM_FROUND_TO_NEG_INF 0x01
@@ -341,42 +341,48 @@ static const size_t BITSCAN_NO_BIT_SET_64 = 64;
#define _MM_FROUND_TO_ZERO 0x03
#define _MM_FROUND_CUR_DIRECTION 0x04
+#undef _mm_blendv_ps
#define _mm_blendv_ps __emu_mm_blendv_ps
__forceinline __m128 _mm_blendv_ps( __m128 value, __m128 input, __m128 mask ) {
return _mm_or_ps(_mm_and_ps(mask, input), _mm_andnot_ps(mask, value));
}
+#undef _mm_blend_ps
#define _mm_blend_ps __emu_mm_blend_ps
__forceinline __m128 _mm_blend_ps( __m128 value, __m128 input, const int mask ) {
assert(mask < 0x10); return _mm_blendv_ps(value, input, _mm_lookupmask_ps[mask]);
}
+#undef _mm_blendv_epi8
#define _mm_blendv_epi8 __emu_mm_blendv_epi8
__forceinline __m128i _mm_blendv_epi8( __m128i value, __m128i input, __m128i mask ) {
return _mm_or_si128(_mm_and_si128(mask, input), _mm_andnot_si128(mask, value));
}
+#undef _mm_mullo_epi32
#define _mm_mullo_epi32 __emu_mm_mullo_epi32
__forceinline __m128i _mm_mullo_epi32( __m128i value, __m128i input ) {
__m128i rvalue;
char* _r = (char*)(&rvalue + 1);
char* _v = (char*)(& value + 1);
char* _i = (char*)(& input + 1);
- for( ssize_t i = -16 ; i != 0 ; i += 4 ) *((int32*)(_r + i)) = *((int32*)(_v + i))* *((int32*)(_i + i));
+ for( ssize_t i = -16 ; i != 0 ; i += 4 ) *((int32_t*)(_r + i)) = *((int32_t*)(_v + i))* *((int32_t*)(_i + i));
return rvalue;
}
-
+#undef _mm_min_epi32
#define _mm_min_epi32 __emu_mm_min_epi32
__forceinline __m128i _mm_min_epi32( __m128i value, __m128i input ) {
return _mm_blendv_epi8(input, value, _mm_cmplt_epi32(value, input));
}
+#undef _mm_max_epi32
#define _mm_max_epi32 __emu_mm_max_epi32
__forceinline __m128i _mm_max_epi32( __m128i value, __m128i input ) {
return _mm_blendv_epi8(value, input, _mm_cmplt_epi32(value, input));
}
+#undef _mm_extract_epi32
#define _mm_extract_epi32 __emu_mm_extract_epi32
__forceinline int _mm_extract_epi32( __m128i input, const int index ) {
switch ( index ) {
@@ -388,20 +394,24 @@ __forceinline int _mm_extract_epi32( __m128i input, const int index ) {
}
}
+#undef _mm_insert_epi32
#define _mm_insert_epi32 __emu_mm_insert_epi32
__forceinline __m128i _mm_insert_epi32( __m128i value, int input, const int index ) {
assert(index >= 0 && index < 4); ((int*)&value)[index] = input; return value;
}
+#undef _mm_extract_ps
#define _mm_extract_ps __emu_mm_extract_ps
__forceinline int _mm_extract_ps( __m128 input, const int index ) {
- int32* ptr = (int32*)&input; return ptr[index];
+ int32_t* ptr = (int32_t*)&input; return ptr[index];
}
+#undef _mm_insert_ps
#define _mm_insert_ps __emu_mm_insert_ps
__forceinline __m128 _mm_insert_ps( __m128 value, __m128 input, const int index )
{ assert(index < 0x100); ((float*)&value)[(index >> 4)&0x3] = ((float*)&input)[index >> 6]; return _mm_andnot_ps(_mm_lookupmask_ps[index&0xf], value); }
+#undef _mm_round_ps
#define _mm_round_ps __emu_mm_round_ps
__forceinline __m128 _mm_round_ps( __m128 value, const int flags )
{
@@ -415,18 +425,55 @@ __forceinline __m128 _mm_round_ps( __m128 value, const int flags )
return value;
}
-#ifdef _M_X64
+# ifdef _M_X64
+#undef _mm_insert_epi64
#define _mm_insert_epi64 __emu_mm_insert_epi64
__forceinline __m128i _mm_insert_epi64( __m128i value, __int64 input, const int index ) {
assert(size_t(index) < 4); ((__int64*)&value)[index] = input; return value;
}
+#undef _mm_extract_epi64
#define _mm_extract_epi64 __emu_mm_extract_epi64
__forceinline __int64 _mm_extract_epi64( __m128i input, const int index ) {
assert(size_t(index) < 2);
return index == 0 ? _mm_cvtsi128_si64x(input) : _mm_cvtsi128_si64x(_mm_unpackhi_epi64(input, input));
}
-#endif
+# endif
+
+# endif
+
+#undef _mm_fabs_ps
+#define _mm_fabs_ps(x) _mm_and_ps(x, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)))
+
+/* Return a __m128 with every element set to the largest element of v. */
+ccl_device_inline __m128 _mm_hmax_ps(__m128 v)
+{
+ /* v[0, 1, 2, 3] => [0, 1, 0, 1] and [2, 3, 2, 3] => v[max(0, 2), max(1, 3), max(0, 2), max(1, 3)] */
+ v = _mm_max_ps(_mm_movehl_ps(v, v), _mm_movelh_ps(v, v));
+ /* v[max(0, 2), max(1, 3), max(0, 2), max(1, 3)] => [4 times max(1, 3)] and [4 times max(0, 2)] => v[4 times max(0, 1, 2, 3)] */
+ v = _mm_max_ps(_mm_movehdup_ps(v), _mm_moveldup_ps(v));
+ return v;
+}
+
+/* Return the sum of the four elements of x. */
+ccl_device_inline float _mm_hsum_ss(__m128 x)
+{
+ __m128 a = _mm_movehdup_ps(x);
+ __m128 b = _mm_add_ps(x, a);
+ return _mm_cvtss_f32(_mm_add_ss(_mm_movehl_ps(a, b), b));
+}
+
+/* Return a __m128 with every element set to the sum of the four elements of x. */
+ccl_device_inline __m128 _mm_hsum_ps(__m128 x)
+{
+ x = _mm_hadd_ps(x, x);
+ x = _mm_hadd_ps(x, x);
+ return x;
+}
+
+/* Replace elements of x with zero where mask isn't set. */
+#undef _mm_mask_ps
+#define _mm_mask_ps(x, mask) _mm_blendv_ps(_mm_setzero_ps(), x, mask)
#endif
diff --git a/intern/cycles/util/util_string.cpp b/intern/cycles/util/util_string.cpp
index a1008d510d1..94ad512982c 100644
--- a/intern/cycles/util/util_string.cpp
+++ b/intern/cycles/util/util_string.cpp
@@ -148,6 +148,12 @@ void string_replace(string& haystack, const string& needle, const string& other)
string string_remove_trademark(const string &s)
{
string result = s;
+
+ /* Special case, so we don't leave sequential spaces behind. */
+ /* TODO(sergey): Consider using regex perhaps? */
+ string_replace(result, " (TM)", "");
+ string_replace(result, " (R)", "");
+
string_replace(result, "(TM)", "");
string_replace(result, "(R)", "");
diff --git a/intern/cycles/util/util_task.cpp b/intern/cycles/util/util_task.cpp
index fb0c34e1dc4..6ed97b0e0a6 100644
--- a/intern/cycles/util/util_task.cpp
+++ b/intern/cycles/util/util_task.cpp
@@ -206,9 +206,9 @@ void TaskScheduler::init(int num_threads)
threads.resize(num_threads);
const int num_groups = system_cpu_group_count();
- unsigned short num_process_groups;
+ unsigned short num_process_groups = 0;
vector<unsigned short> process_groups;
- int current_group_threads;
+ int current_group_threads = 0;
if(num_groups > 1) {
process_groups.resize(num_groups);
num_process_groups = system_cpu_process_groups(num_groups,
diff --git a/intern/cycles/util/util_texture.h b/intern/cycles/util/util_texture.h
index aff928ea2ee..df255f43059 100644
--- a/intern/cycles/util/util_texture.h
+++ b/intern/cycles/util/util_texture.h
@@ -21,62 +21,22 @@ CCL_NAMESPACE_BEGIN
/* Texture limits on devices. */
-/* CPU */
-#define TEX_NUM_FLOAT4_CPU 1024
-#define TEX_NUM_BYTE4_CPU 1024
-#define TEX_NUM_HALF4_CPU 1024
-#define TEX_NUM_FLOAT_CPU 1024
-#define TEX_NUM_BYTE_CPU 1024
-#define TEX_NUM_HALF_CPU 1024
-#define TEX_START_FLOAT4_CPU 0
-#define TEX_START_BYTE4_CPU TEX_NUM_FLOAT4_CPU
-#define TEX_START_HALF4_CPU (TEX_NUM_FLOAT4_CPU + TEX_NUM_BYTE4_CPU)
-#define TEX_START_FLOAT_CPU (TEX_NUM_FLOAT4_CPU + TEX_NUM_BYTE4_CPU + TEX_NUM_HALF4_CPU)
-#define TEX_START_BYTE_CPU (TEX_NUM_FLOAT4_CPU + TEX_NUM_BYTE4_CPU + TEX_NUM_HALF4_CPU + TEX_NUM_FLOAT_CPU)
-#define TEX_START_HALF_CPU (TEX_NUM_FLOAT4_CPU + TEX_NUM_BYTE4_CPU + TEX_NUM_HALF4_CPU + TEX_NUM_FLOAT_CPU + TEX_NUM_BYTE_CPU)
-
/* CUDA (Geforce 4xx and 5xx) */
-#define TEX_NUM_FLOAT4_CUDA 5
-#define TEX_NUM_BYTE4_CUDA 85
-#define TEX_NUM_HALF4_CUDA 0
-#define TEX_NUM_FLOAT_CUDA 0
-#define TEX_NUM_BYTE_CUDA 0
-#define TEX_NUM_HALF_CUDA 0
-#define TEX_START_FLOAT4_CUDA 0
-#define TEX_START_BYTE4_CUDA TEX_NUM_FLOAT4_CUDA
-#define TEX_START_HALF4_CUDA (TEX_NUM_FLOAT4_CUDA + TEX_NUM_BYTE4_CUDA)
-#define TEX_START_FLOAT_CUDA (TEX_NUM_FLOAT4_CUDA + TEX_NUM_BYTE4_CUDA + TEX_NUM_HALF4_CUDA)
-#define TEX_START_BYTE_CUDA (TEX_NUM_FLOAT4_CUDA + TEX_NUM_BYTE4_CUDA + TEX_NUM_HALF4_CUDA + TEX_NUM_FLOAT_CUDA)
-#define TEX_START_HALF_CUDA (TEX_NUM_FLOAT4_CUDA + TEX_NUM_BYTE4_CUDA + TEX_NUM_HALF4_CUDA + TEX_NUM_FLOAT_CUDA + TEX_NUM_BYTE_CUDA)
-
-/* CUDA (Kepler, Geforce 6xx and above) */
-#define TEX_NUM_FLOAT4_CUDA_KEPLER 1024
-#define TEX_NUM_BYTE4_CUDA_KEPLER 1024
-#define TEX_NUM_HALF4_CUDA_KEPLER 1024
-#define TEX_NUM_FLOAT_CUDA_KEPLER 1024
-#define TEX_NUM_BYTE_CUDA_KEPLER 1024
-#define TEX_NUM_HALF_CUDA_KEPLER 1024
-#define TEX_START_FLOAT4_CUDA_KEPLER 0
-#define TEX_START_BYTE4_CUDA_KEPLER TEX_NUM_FLOAT4_CUDA_KEPLER
-#define TEX_START_HALF4_CUDA_KEPLER (TEX_NUM_FLOAT4_CUDA_KEPLER + TEX_NUM_BYTE4_CUDA_KEPLER)
-#define TEX_START_FLOAT_CUDA_KEPLER (TEX_NUM_FLOAT4_CUDA_KEPLER + TEX_NUM_BYTE4_CUDA_KEPLER + TEX_NUM_HALF4_CUDA_KEPLER)
-#define TEX_START_BYTE_CUDA_KEPLER (TEX_NUM_FLOAT4_CUDA_KEPLER + TEX_NUM_BYTE4_CUDA_KEPLER + TEX_NUM_HALF4_CUDA_KEPLER + TEX_NUM_FLOAT_CUDA_KEPLER)
-#define TEX_START_HALF_CUDA_KEPLER (TEX_NUM_FLOAT4_CUDA_KEPLER + TEX_NUM_BYTE4_CUDA_KEPLER + TEX_NUM_HALF4_CUDA_KEPLER + TEX_NUM_FLOAT_CUDA_KEPLER + TEX_NUM_BYTE_CUDA_KEPLER)
-
-/* OpenCL */
-#define TEX_NUM_FLOAT4_OPENCL 1024
-#define TEX_NUM_BYTE4_OPENCL 1024
-#define TEX_NUM_HALF4_OPENCL 0
-#define TEX_NUM_FLOAT_OPENCL 1024
-#define TEX_NUM_BYTE_OPENCL 1024
-#define TEX_NUM_HALF_OPENCL 0
-#define TEX_START_FLOAT4_OPENCL 0
-#define TEX_START_BYTE4_OPENCL TEX_NUM_FLOAT4_OPENCL
-#define TEX_START_HALF4_OPENCL (TEX_NUM_FLOAT4_OPENCL + TEX_NUM_BYTE4_OPENCL)
-#define TEX_START_FLOAT_OPENCL (TEX_NUM_FLOAT4_OPENCL + TEX_NUM_BYTE4_OPENCL + TEX_NUM_HALF4_OPENCL)
-#define TEX_START_BYTE_OPENCL (TEX_NUM_FLOAT4_OPENCL + TEX_NUM_BYTE4_OPENCL + TEX_NUM_HALF4_OPENCL + TEX_NUM_FLOAT_OPENCL)
-#define TEX_START_HALF_OPENCL (TEX_NUM_FLOAT4_OPENCL + TEX_NUM_BYTE4_OPENCL + TEX_NUM_HALF4_OPENCL + TEX_NUM_FLOAT_OPENCL + TEX_NUM_BYTE_OPENCL)
-
+#define TEX_NUM_FLOAT4_CUDA 5
+#define TEX_NUM_BYTE4_CUDA 84
+#define TEX_NUM_HALF4_CUDA 0
+#define TEX_NUM_FLOAT_CUDA 0
+#define TEX_NUM_BYTE_CUDA 0
+#define TEX_NUM_HALF_CUDA 0
+#define TEX_START_FLOAT4_CUDA 0
+#define TEX_START_BYTE4_CUDA TEX_NUM_FLOAT4_CUDA
+#define TEX_START_HALF4_CUDA (TEX_NUM_FLOAT4_CUDA + TEX_NUM_BYTE4_CUDA)
+#define TEX_START_FLOAT_CUDA (TEX_NUM_FLOAT4_CUDA + TEX_NUM_BYTE4_CUDA + TEX_NUM_HALF4_CUDA)
+#define TEX_START_BYTE_CUDA (TEX_NUM_FLOAT4_CUDA + TEX_NUM_BYTE4_CUDA + TEX_NUM_HALF4_CUDA + TEX_NUM_FLOAT_CUDA)
+#define TEX_START_HALF_CUDA (TEX_NUM_FLOAT4_CUDA + TEX_NUM_BYTE4_CUDA + TEX_NUM_HALF4_CUDA + TEX_NUM_FLOAT_CUDA + TEX_NUM_BYTE_CUDA)
+
+/* Any architecture other than old CUDA cards */
+#define TEX_NUM_MAX (INT_MAX >> 4)
/* Color to use when textures are not found. */
#define TEX_IMAGE_MISSING_R 1
@@ -84,6 +44,14 @@ CCL_NAMESPACE_BEGIN
#define TEX_IMAGE_MISSING_B 1
#define TEX_IMAGE_MISSING_A 1
+#if defined (__KERNEL_CUDA__) && (__CUDA_ARCH__ < 300)
+# define kernel_tex_type(tex) (tex < TEX_START_BYTE4_CUDA ? IMAGE_DATA_TYPE_FLOAT4 : IMAGE_DATA_TYPE_BYTE4)
+# define kernel_tex_index(tex) (tex)
+#else
+# define kernel_tex_type(tex) (tex & IMAGE_DATA_TYPE_MASK)
+# define kernel_tex_index(tex) (tex >> IMAGE_DATA_TYPE_SHIFT)
+#endif
+
CCL_NAMESPACE_END
#endif /* __UTIL_TEXTURE_H__ */
diff --git a/intern/cycles/util/util_types.h b/intern/cycles/util/util_types.h
index aa22f6a2c57..a5d1d7152d5 100644
--- a/intern/cycles/util/util_types.h
+++ b/intern/cycles/util/util_types.h
@@ -18,78 +18,75 @@
#define __UTIL_TYPES_H__
#ifndef __KERNEL_OPENCL__
-
-#include <stdlib.h>
-
+# include <stdlib.h>
#endif
/* Bitness */
#if defined(__ppc64__) || defined(__PPC64__) || defined(__x86_64__) || defined(__ia64__) || defined(_M_X64)
-#define __KERNEL_64_BIT__
+# define __KERNEL_64_BIT__
#endif
/* Qualifiers for kernel code shared by CPU and GPU */
#ifndef __KERNEL_GPU__
-
-#define ccl_device static inline
-#define ccl_device_noinline static
-#define ccl_global
-#define ccl_constant
-#define ccl_local
-#define ccl_local_param
-#define ccl_private
-#define ccl_restrict __restrict
-#define __KERNEL_WITH_SSE_ALIGN__
-
-#if defined(_WIN32) && !defined(FREE_WINDOWS)
-#define ccl_device_inline static __forceinline
-#define ccl_device_forceinline static __forceinline
-#define ccl_align(...) __declspec(align(__VA_ARGS__))
-#ifdef __KERNEL_64_BIT__
-#define ccl_try_align(...) __declspec(align(__VA_ARGS__))
-#else
-#undef __KERNEL_WITH_SSE_ALIGN__
-#define ccl_try_align(...) /* not support for function arguments (error C2719) */
-#endif
-#define ccl_may_alias
-#define ccl_always_inline __forceinline
-#define ccl_never_inline __declspec(noinline)
-#define ccl_maybe_unused
-
-#else
-
-#define ccl_device_inline static inline __attribute__((always_inline))
-#define ccl_device_forceinline static inline __attribute__((always_inline))
-#define ccl_align(...) __attribute__((aligned(__VA_ARGS__)))
-#ifndef FREE_WINDOWS64
-#define __forceinline inline __attribute__((always_inline))
-#endif
-#define ccl_try_align(...) __attribute__((aligned(__VA_ARGS__)))
-#define ccl_may_alias __attribute__((__may_alias__))
-#define ccl_always_inline __attribute__((always_inline))
-#define ccl_never_inline __attribute__((noinline))
-#define ccl_maybe_unused __attribute__((used))
-
-#endif
-
-#endif
+# define ccl_device static inline
+# define ccl_device_noinline static
+# define ccl_global
+# define ccl_constant
+# define ccl_local
+# define ccl_local_param
+# define ccl_private
+# define ccl_restrict __restrict
+# define __KERNEL_WITH_SSE_ALIGN__
+
+# if defined(_WIN32) && !defined(FREE_WINDOWS)
+# define ccl_device_inline static __forceinline
+# define ccl_device_forceinline static __forceinline
+# define ccl_align(...) __declspec(align(__VA_ARGS__))
+# ifdef __KERNEL_64_BIT__
+# define ccl_try_align(...) __declspec(align(__VA_ARGS__))
+# else /* __KERNEL_64_BIT__ */
+# undef __KERNEL_WITH_SSE_ALIGN__
+/* No support for function arguments (error C2719). */
+# define ccl_try_align(...)
+# endif /* __KERNEL_64_BIT__ */
+# define ccl_may_alias
+# define ccl_always_inline __forceinline
+# define ccl_never_inline __declspec(noinline)
+# define ccl_maybe_unused
+# else /* _WIN32 && !FREE_WINDOWS */
+# define ccl_device_inline static inline __attribute__((always_inline))
+# define ccl_device_forceinline static inline __attribute__((always_inline))
+# define ccl_align(...) __attribute__((aligned(__VA_ARGS__)))
+# ifndef FREE_WINDOWS64
+# define __forceinline inline __attribute__((always_inline))
+# endif
+# define ccl_try_align(...) __attribute__((aligned(__VA_ARGS__)))
+# define ccl_may_alias __attribute__((__may_alias__))
+# define ccl_always_inline __attribute__((always_inline))
+# define ccl_never_inline __attribute__((noinline))
+# define ccl_maybe_unused __attribute__((used))
+# endif /* _WIN32 && !FREE_WINDOWS */
+
+/* Use to suppress '-Wimplicit-fallthrough' (in place of 'break'). */
+# if defined(__GNUC__) && (__GNUC__ >= 7) /* gcc7.0+ only */
+# define ATTR_FALLTHROUGH __attribute__((fallthrough))
+# else
+# define ATTR_FALLTHROUGH ((void)0)
+# endif
+#endif /* __KERNEL_GPU__ */
/* Standard Integer Types */
#ifndef __KERNEL_GPU__
-
/* int8_t, uint16_t, and friends */
-#ifndef _WIN32
-#include <stdint.h>
-#endif
-
+# ifndef _WIN32
+# include <stdint.h>
+# endif
/* SIMD Types */
-
-#include "util/util_optimization.h"
-
-#endif
+# include "util/util_optimization.h"
+#endif /* __KERNEL_GPU__ */
CCL_NAMESPACE_BEGIN
@@ -102,24 +99,19 @@ CCL_NAMESPACE_BEGIN
/* Shorter Unsigned Names */
#ifndef __KERNEL_OPENCL__
-
typedef unsigned char uchar;
typedef unsigned int uint;
-
+typedef unsigned short ushort;
#endif
/* Fixed Bits Types */
#ifdef __KERNEL_OPENCL__
-
typedef ulong uint64_t;
-
#endif
#ifndef __KERNEL_GPU__
-
-#ifdef _WIN32
-
+# ifdef _WIN32
typedef signed char int8_t;
typedef unsigned char uint8_t;
@@ -131,360 +123,26 @@ typedef unsigned int uint32_t;
typedef long long int64_t;
typedef unsigned long long uint64_t;
-
-#ifdef __KERNEL_64_BIT__
+# ifdef __KERNEL_64_BIT__
typedef int64_t ssize_t;
-#else
+# else
typedef int32_t ssize_t;
-#endif
-
-#endif
+# endif
+# endif /* _WIN32 */
/* Generic Memory Pointer */
typedef uint64_t device_ptr;
+#endif /* __KERNEL_GPU__ */
-/* Vector Types */
-
-struct uchar2 {
- uchar x, y;
-
- __forceinline uchar operator[](int i) const { return *(&x + i); }
- __forceinline uchar& operator[](int i) { return *(&x + i); }
-};
-
-struct uchar3 {
- uchar x, y, z;
-
- __forceinline uchar operator[](int i) const { return *(&x + i); }
- __forceinline uchar& operator[](int i) { return *(&x + i); }
-};
-
-struct uchar4 {
- uchar x, y, z, w;
-
- __forceinline uchar operator[](int i) const { return *(&x + i); }
- __forceinline uchar& operator[](int i) { return *(&x + i); }
-};
-
-struct int2 {
- int x, y;
-
- __forceinline int operator[](int i) const { return *(&x + i); }
- __forceinline int& operator[](int i) { return *(&x + i); }
-};
-
-struct ccl_try_align(16) int3 {
-#ifdef __KERNEL_SSE__
- union {
- __m128i m128;
- struct { int x, y, z, w; };
- };
-
- __forceinline int3() {}
- __forceinline int3(const __m128i& a) : m128(a) {}
- __forceinline operator const __m128i&(void) const { return m128; }
- __forceinline operator __m128i&(void) { return m128; }
-
- int3(const int3& a) { m128 = a.m128; }
- int3& operator =(const int3& a) { m128 = a.m128; return *this; }
-#else
- int x, y, z, w;
-#endif
-
- __forceinline int operator[](int i) const { return *(&x + i); }
- __forceinline int& operator[](int i) { return *(&x + i); }
-};
-
-struct ccl_try_align(16) int4 {
-#ifdef __KERNEL_SSE__
- union {
- __m128i m128;
- struct { int x, y, z, w; };
- };
-
- __forceinline int4() {}
- __forceinline int4(const __m128i& a) : m128(a) {}
- __forceinline operator const __m128i&(void) const { return m128; }
- __forceinline operator __m128i&(void) { return m128; }
-
- int4(const int4& a) : m128(a.m128) {}
- int4& operator=(const int4& a) { m128 = a.m128; return *this; }
-#else
- int x, y, z, w;
-#endif
-
- __forceinline int operator[](int i) const { return *(&x + i); }
- __forceinline int& operator[](int i) { return *(&x + i); }
-};
-
-struct uint2 {
- uint x, y;
-
- __forceinline uint operator[](uint i) const { return *(&x + i); }
- __forceinline uint& operator[](uint i) { return *(&x + i); }
-};
-
-struct uint3 {
- uint x, y, z;
-
- __forceinline uint operator[](uint i) const { return *(&x + i); }
- __forceinline uint& operator[](uint i) { return *(&x + i); }
-};
-
-struct uint4 {
- uint x, y, z, w;
-
- __forceinline uint operator[](uint i) const { return *(&x + i); }
- __forceinline uint& operator[](uint i) { return *(&x + i); }
-};
-
-struct float2 {
- float x, y;
-
- __forceinline float operator[](int i) const { return *(&x + i); }
- __forceinline float& operator[](int i) { return *(&x + i); }
-};
-
-struct ccl_try_align(16) float3 {
-#ifdef __KERNEL_SSE__
- union {
- __m128 m128;
- struct { float x, y, z, w; };
- };
-
- __forceinline float3() {}
- __forceinline float3(const __m128& a) : m128(a) {}
- __forceinline operator const __m128&(void) const { return m128; }
- __forceinline operator __m128&(void) { return m128; }
-
- __forceinline float3(const float3& a) : m128(a.m128) {}
- __forceinline float3& operator =(const float3& a) { m128 = a.m128; return *this; }
-#else
- float x, y, z, w;
-#endif
-
- __forceinline float operator[](int i) const { return *(&x + i); }
- __forceinline float& operator[](int i) { return *(&x + i); }
-};
-
-struct ccl_try_align(16) float4 {
-#ifdef __KERNEL_SSE__
- union {
- __m128 m128;
- struct { float x, y, z, w; };
- };
-
- __forceinline float4() {}
- __forceinline float4(const __m128& a) : m128(a) {}
- __forceinline operator const __m128&(void) const { return m128; }
- __forceinline operator __m128&(void) { return m128; }
-
- __forceinline float4(const float4& a) : m128(a.m128) {}
- __forceinline float4& operator =(const float4& a) { m128 = a.m128; return *this; }
-
-#else
- float x, y, z, w;
-#endif
-
- __forceinline float operator[](int i) const { return *(&x + i); }
- __forceinline float& operator[](int i) { return *(&x + i); }
-};
-
-template<typename T>
-class vector3
-{
-public:
- T x, y, z;
-
- ccl_always_inline vector3() {}
- ccl_always_inline vector3(const T& a)
- : x(a), y(a), z(a) {}
- ccl_always_inline vector3(const T& x, const T& y, const T& z)
- : x(x), y(y), z(z) {}
-};
-
-#endif
-
-#ifndef __KERNEL_GPU__
-
-/* Vector Type Constructors
- *
- * OpenCL does not support C++ class, so we use these instead. */
-
-ccl_device_inline uchar2 make_uchar2(uchar x, uchar y)
-{
- uchar2 a = {x, y};
- return a;
-}
-
-ccl_device_inline uchar3 make_uchar3(uchar x, uchar y, uchar z)
-{
- uchar3 a = {x, y, z};
- return a;
-}
-
-ccl_device_inline uchar4 make_uchar4(uchar x, uchar y, uchar z, uchar w)
-{
- uchar4 a = {x, y, z, w};
- return a;
-}
-
-ccl_device_inline int2 make_int2(int x, int y)
-{
- int2 a = {x, y};
- return a;
-}
-
-ccl_device_inline int3 make_int3(int x, int y, int z)
-{
-#ifdef __KERNEL_SSE__
- int3 a;
- a.m128 = _mm_set_epi32(0, z, y, x);
-#else
- int3 a = {x, y, z, 0};
-#endif
-
- return a;
-}
-
-ccl_device_inline int4 make_int4(int x, int y, int z, int w)
-{
-#ifdef __KERNEL_SSE__
- int4 a;
- a.m128 = _mm_set_epi32(w, z, y, x);
-#else
- int4 a = {x, y, z, w};
-#endif
-
- return a;
-}
-
-ccl_device_inline uint2 make_uint2(uint x, uint y)
-{
- uint2 a = {x, y};
- return a;
-}
-
-ccl_device_inline uint3 make_uint3(uint x, uint y, uint z)
-{
- uint3 a = {x, y, z};
- return a;
-}
-
-ccl_device_inline uint4 make_uint4(uint x, uint y, uint z, uint w)
-{
- uint4 a = {x, y, z, w};
- return a;
-}
-
-ccl_device_inline float2 make_float2(float x, float y)
-{
- float2 a = {x, y};
- return a;
-}
-
-ccl_device_inline float3 make_float3(float x, float y, float z)
-{
-#ifdef __KERNEL_SSE__
- float3 a;
- a.m128 = _mm_set_ps(0.0f, z, y, x);
-#else
- float3 a = {x, y, z, 0.0f};
-#endif
-
- return a;
-}
-
-ccl_device_inline float4 make_float4(float x, float y, float z, float w)
-{
-#ifdef __KERNEL_SSE__
- float4 a;
- a.m128 = _mm_set_ps(w, z, y, x);
-#else
- float4 a = {x, y, z, w};
-#endif
-
- return a;
-}
-
-ccl_device_inline int3 make_int3(int i)
-{
-#ifdef __KERNEL_SSE__
- int3 a;
- a.m128 = _mm_set1_epi32(i);
-#else
- int3 a = {i, i, i, i};
-#endif
-
- return a;
-}
-
-ccl_device_inline int4 make_int4(int i)
-{
-#ifdef __KERNEL_SSE__
- int4 a;
- a.m128 = _mm_set1_epi32(i);
-#else
- int4 a = {i, i, i, i};
-#endif
-
- return a;
-}
-
-ccl_device_inline float3 make_float3(float f)
-{
-#ifdef __KERNEL_SSE__
- float3 a;
- a.m128 = _mm_set1_ps(f);
-#else
- float3 a = {f, f, f, f};
-#endif
-
- return a;
-}
-
-ccl_device_inline float4 make_float4(float f)
-{
-#ifdef __KERNEL_SSE__
- float4 a;
- a.m128 = _mm_set1_ps(f);
-#else
- float4 a = {f, f, f, f};
-#endif
-
- return a;
-}
-
-ccl_device_inline float4 make_float4(const int4& i)
-{
-#ifdef __KERNEL_SSE__
- float4 a;
- a.m128 = _mm_cvtepi32_ps(i.m128);
-#else
- float4 a = {(float)i.x, (float)i.y, (float)i.z, (float)i.w};
-#endif
-
- return a;
-}
-
-ccl_device_inline int4 make_int4(const float3& f)
+ccl_device_inline size_t align_up(size_t offset, size_t alignment)
{
-#ifdef __KERNEL_SSE__
- int4 a;
- a.m128 = _mm_cvtps_epi32(f.m128);
-#else
- int4 a = {(int)f.x, (int)f.y, (int)f.z, (int)f.w};
-#endif
-
- return a;
+ return (offset + alignment - 1) & ~(alignment - 1);
}
-#endif
-
-ccl_device_inline size_t align_up(size_t offset, size_t alignment)
+ccl_device_inline size_t divide_up(size_t x, size_t y)
{
- return (offset + alignment - 1) & ~(alignment - 1);
+ return (x + y - 1) / y;
}
ccl_device_inline size_t round_up(size_t x, size_t multiple)
@@ -509,6 +167,25 @@ enum InterpolationType {
INTERPOLATION_NUM_TYPES,
};
+/* Texture types
+ * Since we store the type in the lower bits of a flat index,
+ * the shift and bit mask constant below need to be kept in sync.
+ */
+
+enum ImageDataType {
+ IMAGE_DATA_TYPE_FLOAT4 = 0,
+ IMAGE_DATA_TYPE_BYTE4 = 1,
+ IMAGE_DATA_TYPE_HALF4 = 2,
+ IMAGE_DATA_TYPE_FLOAT = 3,
+ IMAGE_DATA_TYPE_BYTE = 4,
+ IMAGE_DATA_TYPE_HALF = 5,
+
+ IMAGE_DATA_NUM_TYPES
+};
+
+#define IMAGE_DATA_TYPE_SHIFT 3
+#define IMAGE_DATA_TYPE_MASK 0x7
+
/* Extension types for textures.
*
* Defines how the image is extrapolated past its original bounds.
@@ -554,7 +231,7 @@ template<typename T> static inline T decltype_helper(T x) { return x; }
* ... the compiler optimizes away the temp var */
#ifdef __GNUC__
#define CHECK_TYPE(var, type) { \
- TYPEOF(var) *__tmp; \
+ TYPEOF(var) *__tmp; \
__tmp = (type *)NULL; \
(void)__tmp; \
} (void)0
@@ -576,5 +253,50 @@ template<typename T> static inline T decltype_helper(T x) { return x; }
CCL_NAMESPACE_END
+#ifndef __KERNEL_GPU__
+# include <cassert>
+# define util_assert(statement) assert(statement)
+#else
+# define util_assert(statement)
+#endif
+
+/* Vectorized types declaration. */
+#include "util/util_types_uchar2.h"
+#include "util/util_types_uchar3.h"
+#include "util/util_types_uchar4.h"
+
+#include "util/util_types_int2.h"
+#include "util/util_types_int3.h"
+#include "util/util_types_int4.h"
+
+#include "util/util_types_uint2.h"
+#include "util/util_types_uint3.h"
+#include "util/util_types_uint4.h"
+
+#include "util/util_types_float2.h"
+#include "util/util_types_float3.h"
+#include "util/util_types_float4.h"
+
+#include "util/util_types_vector3.h"
+
+/* Vectorized types implementation. */
+#include "util/util_types_uchar2_impl.h"
+#include "util/util_types_uchar3_impl.h"
+#include "util/util_types_uchar4_impl.h"
+
+#include "util/util_types_int2_impl.h"
+#include "util/util_types_int3_impl.h"
+#include "util/util_types_int4_impl.h"
+
+#include "util/util_types_uint2_impl.h"
+#include "util/util_types_uint3_impl.h"
+#include "util/util_types_uint4_impl.h"
+
+#include "util/util_types_float2_impl.h"
+#include "util/util_types_float3_impl.h"
+#include "util/util_types_float4_impl.h"
+
+#include "util/util_types_vector3_impl.h"
+
#endif /* __UTIL_TYPES_H__ */
diff --git a/intern/cycles/util/util_types_float2.h b/intern/cycles/util/util_types_float2.h
new file mode 100644
index 00000000000..ec7a1f717a1
--- /dev/null
+++ b/intern/cycles/util/util_types_float2.h
@@ -0,0 +1,40 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __UTIL_TYPES_FLOAT2_H__
+#define __UTIL_TYPES_FLOAT2_H__
+
+#ifndef __UTIL_TYPES_H__
+# error "Do not include this file directly, include util_types.h instead."
+#endif
+
+CCL_NAMESPACE_BEGIN
+
+#ifndef __KERNEL_GPU__
+struct float2 {
+ float x, y;
+
+ __forceinline float operator[](int i) const;
+ __forceinline float& operator[](int i);
+};
+
+ccl_device_inline float2 make_float2(float x, float y);
+ccl_device_inline void print_float2(const char *label, const float2& a);
+#endif /* __KERNEL_GPU__ */
+
+CCL_NAMESPACE_END
+
+#endif /* __UTIL_TYPES_FLOAT2_H__ */
diff --git a/intern/cycles/util/util_types_float2_impl.h b/intern/cycles/util/util_types_float2_impl.h
new file mode 100644
index 00000000000..782dda195eb
--- /dev/null
+++ b/intern/cycles/util/util_types_float2_impl.h
@@ -0,0 +1,59 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __UTIL_TYPES_FLOAT2_IMPL_H__
+#define __UTIL_TYPES_FLOAT2_IMPL_H__
+
+#ifndef __UTIL_TYPES_H__
+# error "Do not include this file directly, include util_types.h instead."
+#endif
+
+#ifndef __KERNEL_GPU__
+# include <cstdio>
+#endif
+
+CCL_NAMESPACE_BEGIN
+
+#ifndef __KERNEL_GPU__
+__forceinline float float2::operator[](int i) const
+{
+ util_assert(i >= 0);
+ util_assert(i < 2);
+ return *(&x + i);
+}
+
+__forceinline float& float2::operator[](int i)
+{
+ util_assert(i >= 0);
+ util_assert(i < 2);
+ return *(&x + i);
+}
+
+ccl_device_inline float2 make_float2(float x, float y)
+{
+ float2 a = {x, y};
+ return a;
+}
+
+ccl_device_inline void print_float2(const char *label, const float2& a)
+{
+ printf("%s: %.8f %.8f\n", label, (double)a.x, (double)a.y);
+}
+#endif /* __KERNEL_GPU__ */
+
+CCL_NAMESPACE_END
+
+#endif /* __UTIL_TYPES_FLOAT2_IMPL_H__ */
diff --git a/intern/cycles/util/util_types_float3.h b/intern/cycles/util/util_types_float3.h
new file mode 100644
index 00000000000..28146ad04f7
--- /dev/null
+++ b/intern/cycles/util/util_types_float3.h
@@ -0,0 +1,57 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __UTIL_TYPES_FLOAT3_H__
+#define __UTIL_TYPES_FLOAT3_H__
+
+#ifndef __UTIL_TYPES_H__
+# error "Do not include this file directly, include util_types.h instead."
+#endif
+
+CCL_NAMESPACE_BEGIN
+
+#ifndef __KERNEL_GPU__
+struct ccl_try_align(16) float3 {
+#ifdef __KERNEL_SSE__
+ union {
+ __m128 m128;
+ struct { float x, y, z, w; };
+ };
+
+ __forceinline float3();
+ __forceinline float3(const float3& a);
+ __forceinline explicit float3(const __m128& a);
+
+ __forceinline operator const __m128&(void) const;
+ __forceinline operator __m128&(void);
+
+ __forceinline float3& operator =(const float3& a);
+#else /* __KERNEL_SSE__ */
+ float x, y, z, w;
+#endif /* __KERNEL_SSE__ */
+
+ __forceinline float operator[](int i) const;
+ __forceinline float& operator[](int i);
+};
+
+ccl_device_inline float3 make_float3(float f);
+ccl_device_inline float3 make_float3(float x, float y, float z);
+ccl_device_inline void print_float3(const char *label, const float3& a);
+#endif /* __KERNEL_GPU__ */
+
+CCL_NAMESPACE_END
+
+#endif /* __UTIL_TYPES_FLOAT3_H__ */
diff --git a/intern/cycles/util/util_types_float3_impl.h b/intern/cycles/util/util_types_float3_impl.h
new file mode 100644
index 00000000000..45f61767d3f
--- /dev/null
+++ b/intern/cycles/util/util_types_float3_impl.h
@@ -0,0 +1,105 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __UTIL_TYPES_FLOAT3_IMPL_H__
+#define __UTIL_TYPES_FLOAT3_IMPL_H__
+
+#ifndef __UTIL_TYPES_H__
+# error "Do not include this file directly, include util_types.h instead."
+#endif
+
+#ifndef __KERNEL_GPU__
+# include <cstdio>
+#endif
+
+CCL_NAMESPACE_BEGIN
+
+#ifndef __KERNEL_GPU__
+#ifdef __KERNEL_SSE__
+__forceinline float3::float3()
+{
+}
+
+__forceinline float3::float3(const float3& a)
+ : m128(a.m128)
+{
+}
+
+__forceinline float3::float3(const __m128& a)
+ : m128(a)
+{
+}
+
+__forceinline float3::operator const __m128&(void) const
+{
+ return m128;
+}
+
+__forceinline float3::operator __m128&(void)
+{
+ return m128;
+}
+
+__forceinline float3& float3::operator =(const float3& a)
+{
+ m128 = a.m128;
+ return *this;
+}
+#endif /* __KERNEL_SSE__ */
+
+__forceinline float float3::operator[](int i) const
+{
+ util_assert(i >= 0);
+ util_assert(i < 3);
+ return *(&x + i);
+}
+
+__forceinline float& float3::operator[](int i)
+{
+ util_assert(i >= 0);
+ util_assert(i < 3);
+ return *(&x + i);
+}
+
+ccl_device_inline float3 make_float3(float f)
+{
+#ifdef __KERNEL_SSE__
+ float3 a(_mm_set1_ps(f));
+#else
+ float3 a = {f, f, f, f};
+#endif
+ return a;
+}
+
+ccl_device_inline float3 make_float3(float x, float y, float z)
+{
+#ifdef __KERNEL_SSE__
+ float3 a(_mm_set_ps(0.0f, z, y, x));
+#else
+ float3 a = {x, y, z, 0.0f};
+#endif
+ return a;
+}
+
+ccl_device_inline void print_float3(const char *label, const float3& a)
+{
+ printf("%s: %.8f %.8f %.8f\n", label, (double)a.x, (double)a.y, (double)a.z);
+}
+#endif /* __KERNEL_GPU__ */
+
+CCL_NAMESPACE_END
+
+#endif /* __UTIL_TYPES_FLOAT3_IMPL_H__ */
diff --git a/intern/cycles/util/util_types_float4.h b/intern/cycles/util/util_types_float4.h
new file mode 100644
index 00000000000..a7d9abe1b95
--- /dev/null
+++ b/intern/cycles/util/util_types_float4.h
@@ -0,0 +1,61 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __UTIL_TYPES_FLOAT4_H__
+#define __UTIL_TYPES_FLOAT4_H__
+
+#ifndef __UTIL_TYPES_H__
+# error "Do not include this file directly, include util_types.h instead."
+#endif
+
+CCL_NAMESPACE_BEGIN
+
+#ifndef __KERNEL_GPU__
+struct int4;
+
+struct ccl_try_align(16) float4 {
+#ifdef __KERNEL_SSE__
+ union {
+ __m128 m128;
+ struct { float x, y, z, w; };
+ };
+
+ __forceinline float4();
+ __forceinline float4(const float4& a);
+ __forceinline explicit float4(const __m128& a);
+
+ __forceinline operator const __m128&(void) const;
+ __forceinline operator __m128&(void);
+
+ __forceinline float4& operator =(const float4& a);
+
+#else /* __KERNEL_SSE__ */
+ float x, y, z, w;
+#endif /* __KERNEL_SSE__ */
+
+ __forceinline float operator[](int i) const;
+ __forceinline float& operator[](int i);
+};
+
+ccl_device_inline float4 make_float4(float f);
+ccl_device_inline float4 make_float4(float x, float y, float z, float w);
+ccl_device_inline float4 make_float4(const int4& i);
+ccl_device_inline void print_float4(const char *label, const float4& a);
+#endif /* __KERNEL_GPU__ */
+
+CCL_NAMESPACE_END
+
+#endif /* __UTIL_TYPES_FLOAT4_H__ */
diff --git a/intern/cycles/util/util_types_float4_impl.h b/intern/cycles/util/util_types_float4_impl.h
new file mode 100644
index 00000000000..ff3ec4d4ecf
--- /dev/null
+++ b/intern/cycles/util/util_types_float4_impl.h
@@ -0,0 +1,117 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __UTIL_TYPES_FLOAT4_IMPL_H__
+#define __UTIL_TYPES_FLOAT4_IMPL_H__
+
+#ifndef __UTIL_TYPES_H__
+# error "Do not include this file directly, include util_types.h instead."
+#endif
+
+#ifndef __KERNEL_GPU__
+# include <cstdio>
+#endif
+
+CCL_NAMESPACE_BEGIN
+
+#ifndef __KERNEL_GPU__
+#ifdef __KERNEL_SSE__
+__forceinline float4::float4()
+{
+}
+
+__forceinline float4::float4(const float4& a)
+ : m128(a.m128)
+{
+}
+
+__forceinline float4::float4(const __m128& a)
+ : m128(a)
+{
+}
+
+__forceinline float4::operator const __m128&(void) const
+{
+ return m128;
+}
+
+__forceinline float4::operator __m128&(void)
+{
+ return m128;
+}
+
+__forceinline float4& float4::operator =(const float4& a)
+{
+ m128 = a.m128;
+ return *this;
+}
+#endif /* __KERNEL_SSE__ */
+
+__forceinline float float4::operator[](int i) const
+{
+ util_assert(i >= 0);
+ util_assert(i < 4);
+ return *(&x + i);
+}
+
+__forceinline float& float4::operator[](int i)
+{
+ util_assert(i >= 0);
+ util_assert(i < 4);
+ return *(&x + i);
+}
+
+ccl_device_inline float4 make_float4(float f)
+{
+#ifdef __KERNEL_SSE__
+ float4 a(_mm_set1_ps(f));
+#else
+ float4 a = {f, f, f, f};
+#endif
+ return a;
+}
+
+ccl_device_inline float4 make_float4(float x, float y, float z, float w)
+{
+#ifdef __KERNEL_SSE__
+ float4 a(_mm_set_ps(w, z, y, x));
+#else
+ float4 a = {x, y, z, w};
+#endif
+ return a;
+}
+
+ccl_device_inline float4 make_float4(const int4& i)
+{
+#ifdef __KERNEL_SSE__
+ float4 a(_mm_cvtepi32_ps(i.m128));
+#else
+ float4 a = {(float)i.x, (float)i.y, (float)i.z, (float)i.w};
+#endif
+ return a;
+}
+
+ccl_device_inline void print_float4(const char *label, const float4& a)
+{
+ printf("%s: %.8f %.8f %.8f %.8f\n",
+ label,
+ (double)a.x, (double)a.y, (double)a.z, (double)a.w);
+}
+#endif /* __KERNEL_GPU__ */
+
+CCL_NAMESPACE_END
+
+#endif /* __UTIL_TYPES_FLOAT4_IMPL_H__ */
diff --git a/intern/cycles/util/util_types_int2.h b/intern/cycles/util/util_types_int2.h
new file mode 100644
index 00000000000..82e860f89eb
--- /dev/null
+++ b/intern/cycles/util/util_types_int2.h
@@ -0,0 +1,39 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __UTIL_TYPES_INT2_H__
+#define __UTIL_TYPES_INT2_H__
+
+#ifndef __UTIL_TYPES_H__
+# error "Do not include this file directly, include util_types.h instead."
+#endif
+
+CCL_NAMESPACE_BEGIN
+
+#ifndef __KERNEL_GPU__
+struct int2 {
+ int x, y;
+
+ __forceinline int operator[](int i) const;
+ __forceinline int& operator[](int i);
+};
+
+ccl_device_inline int2 make_int2(int x, int y);
+#endif /* __KERNEL_GPU__ */
+
+CCL_NAMESPACE_END
+
+#endif /* __UTIL_TYPES_INT2_H__ */
diff --git a/intern/cycles/util/util_types_int2_impl.h b/intern/cycles/util/util_types_int2_impl.h
new file mode 100644
index 00000000000..c7d3942e723
--- /dev/null
+++ b/intern/cycles/util/util_types_int2_impl.h
@@ -0,0 +1,50 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __UTIL_TYPES_INT2_IMPL_H__
+#define __UTIL_TYPES_INT2_IMPL_H__
+
+#ifndef __UTIL_TYPES_H__
+# error "Do not include this file directly, include util_types.h instead."
+#endif
+
+CCL_NAMESPACE_BEGIN
+
+#ifndef __KERNEL_GPU__
+int int2::operator[](int i) const
+{
+ util_assert(i >= 0);
+ util_assert(i < 2);
+ return *(&x + i);
+}
+
+int& int2::operator[](int i)
+{
+ util_assert(i >= 0);
+ util_assert(i < 2);
+ return *(&x + i);
+}
+
+ccl_device_inline int2 make_int2(int x, int y)
+{
+ int2 a = {x, y};
+ return a;
+}
+#endif /* __KERNEL_GPU__ */
+
+CCL_NAMESPACE_END
+
+#endif /* __UTIL_TYPES_INT2_IMPL_H__ */
diff --git a/intern/cycles/util/util_types_int3.h b/intern/cycles/util/util_types_int3.h
new file mode 100644
index 00000000000..9d43b201c02
--- /dev/null
+++ b/intern/cycles/util/util_types_int3.h
@@ -0,0 +1,57 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __UTIL_TYPES_INT3_H__
+#define __UTIL_TYPES_INT3_H__
+
+#ifndef __UTIL_TYPES_H__
+# error "Do not include this file directly, include util_types.h instead."
+#endif
+
+CCL_NAMESPACE_BEGIN
+
+#ifndef __KERNEL_GPU__
+struct ccl_try_align(16) int3 {
+#ifdef __KERNEL_SSE__
+ union {
+ __m128i m128;
+ struct { int x, y, z, w; };
+ };
+
+ __forceinline int3();
+ __forceinline int3(const int3& a);
+ __forceinline explicit int3(const __m128i& a);
+
+ __forceinline operator const __m128i&(void) const;
+ __forceinline operator __m128i&(void);
+
+ __forceinline int3& operator =(const int3& a);
+#else /* __KERNEL_SSE__ */
+ int x, y, z, w;
+#endif /* __KERNEL_SSE__ */
+
+ __forceinline int operator[](int i) const;
+ __forceinline int& operator[](int i);
+};
+
+ccl_device_inline int3 make_int3(int i);
+ccl_device_inline int3 make_int3(int x, int y, int z);
+ccl_device_inline void print_int3(const char *label, const int3& a);
+#endif /* __KERNEL_GPU__ */
+
+CCL_NAMESPACE_END
+
+#endif /* __UTIL_TYPES_INT3_H__ */
diff --git a/intern/cycles/util/util_types_int3_impl.h b/intern/cycles/util/util_types_int3_impl.h
new file mode 100644
index 00000000000..ada50c4812c
--- /dev/null
+++ b/intern/cycles/util/util_types_int3_impl.h
@@ -0,0 +1,106 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __UTIL_TYPES_INT3_IMPL_H__
+#define __UTIL_TYPES_INT3_IMPL_H__
+
+#ifndef __UTIL_TYPES_H__
+# error "Do not include this file directly, include util_types.h instead."
+#endif
+
+#ifndef __KERNEL_GPU__
+# include <cstdio>
+#endif
+
+CCL_NAMESPACE_BEGIN
+
+#ifndef __KERNEL_GPU__
+#ifdef __KERNEL_SSE__
+__forceinline int3::int3()
+{
+}
+
+__forceinline int3::int3(const __m128i& a)
+ : m128(a)
+{
+}
+
+__forceinline int3::int3(const int3& a)
+ : m128(a.m128)
+{
+}
+
+__forceinline int3::operator const __m128i&(void) const
+{
+ return m128;
+}
+
+__forceinline int3::operator __m128i&(void)
+{
+ return m128;
+}
+
+__forceinline int3& int3::operator =(const int3& a)
+{
+ m128 = a.m128;
+ return *this;
+}
+#endif /* __KERNEL_SSE__ */
+
+__forceinline int int3::operator[](int i) const
+{
+ util_assert(i >= 0);
+ util_assert(i < 3);
+ return *(&x + i);
+}
+
+__forceinline int& int3::operator[](int i)
+{
+ util_assert(i >= 0);
+ util_assert(i < 3);
+ return *(&x + i);
+}
+
+ccl_device_inline int3 make_int3(int i)
+{
+#ifdef __KERNEL_SSE__
+ int3 a(_mm_set1_epi32(i));
+#else
+ int3 a = {i, i, i, i};
+#endif
+ return a;
+}
+
+ccl_device_inline int3 make_int3(int x, int y, int z)
+{
+#ifdef __KERNEL_SSE__
+ int3 a(_mm_set_epi32(0, z, y, x));
+#else
+ int3 a = {x, y, z, 0};
+#endif
+
+ return a;
+}
+
+ccl_device_inline void print_int3(const char *label, const int3& a)
+{
+ printf("%s: %d %d %d\n", label, a.x, a.y, a.z);
+}
+#endif /* __KERNEL_GPU__ */
+
+CCL_NAMESPACE_END
+
+#endif /* __UTIL_TYPES_INT3_IMPL_H__ */
diff --git a/intern/cycles/util/util_types_int4.h b/intern/cycles/util/util_types_int4.h
new file mode 100644
index 00000000000..cdd0ecbdae5
--- /dev/null
+++ b/intern/cycles/util/util_types_int4.h
@@ -0,0 +1,61 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __UTIL_TYPES_INT4_H__
+#define __UTIL_TYPES_INT4_H__
+
+#ifndef __UTIL_TYPES_H__
+# error "Do not include this file directly, include util_types.h instead."
+#endif
+
+CCL_NAMESPACE_BEGIN
+
+#ifndef __KERNEL_GPU__
+
+struct float3;
+
+struct ccl_try_align(16) int4 {
+#ifdef __KERNEL_SSE__
+ union {
+ __m128i m128;
+ struct { int x, y, z, w; };
+ };
+
+ __forceinline int4();
+ __forceinline int4(const int4& a);
+ __forceinline explicit int4(const __m128i& a);
+
+ __forceinline operator const __m128i&(void) const;
+ __forceinline operator __m128i&(void);
+
+ __forceinline int4& operator=(const int4& a);
+#else /* __KERNEL_SSE__ */
+ int x, y, z, w;
+#endif /* __KERNEL_SSE__ */
+
+ __forceinline int operator[](int i) const;
+ __forceinline int& operator[](int i);
+};
+
+ccl_device_inline int4 make_int4(int i);
+ccl_device_inline int4 make_int4(int x, int y, int z, int w);
+ccl_device_inline int4 make_int4(const float3& f);
+ccl_device_inline void print_int4(const char *label, const int4& a);
+#endif /* __KERNEL_GPU__ */
+
+CCL_NAMESPACE_END
+
+#endif /* __UTIL_TYPES_INT4_H__ */
diff --git a/intern/cycles/util/util_types_int4_impl.h b/intern/cycles/util/util_types_int4_impl.h
new file mode 100644
index 00000000000..07cdc88f2dc
--- /dev/null
+++ b/intern/cycles/util/util_types_int4_impl.h
@@ -0,0 +1,115 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __UTIL_TYPES_INT4_IMPL_H__
+#define __UTIL_TYPES_INT4_IMPL_H__
+
+#ifndef __UTIL_TYPES_H__
+# error "Do not include this file directly, include util_types.h instead."
+#endif
+
+#ifndef __KERNEL_GPU__
+# include <cstdio>
+#endif
+
+CCL_NAMESPACE_BEGIN
+
+#ifndef __KERNEL_GPU__
+#ifdef __KERNEL_SSE__
+__forceinline int4::int4()
+{
+}
+
+__forceinline int4::int4(const int4& a)
+ : m128(a.m128)
+{
+}
+
+__forceinline int4::int4(const __m128i& a)
+ : m128(a)
+{
+}
+
+__forceinline int4::operator const __m128i&(void) const
+{
+ return m128;
+}
+
+__forceinline int4::operator __m128i&(void)
+{
+ return m128;
+}
+
+__forceinline int4& int4::operator=(const int4& a)
+{
+ m128 = a.m128;
+ return *this;
+}
+#endif /* __KERNEL_SSE__ */
+
+__forceinline int int4::operator[](int i) const
+{
+ util_assert(i >= 0);
+ util_assert(i < 4);
+ return *(&x + i);
+}
+
+__forceinline int& int4::operator[](int i)
+{
+ util_assert(i >= 0);
+ util_assert(i < 4);
+ return *(&x + i);
+}
+
+ccl_device_inline int4 make_int4(int i)
+{
+#ifdef __KERNEL_SSE__
+ int4 a(_mm_set1_epi32(i));
+#else
+ int4 a = {i, i, i, i};
+#endif
+ return a;
+}
+
+ccl_device_inline int4 make_int4(int x, int y, int z, int w)
+{
+#ifdef __KERNEL_SSE__
+ int4 a(_mm_set_epi32(w, z, y, x));
+#else
+ int4 a = {x, y, z, w};
+#endif
+ return a;
+}
+
+ccl_device_inline int4 make_int4(const float3& f)
+{
+#ifdef __KERNEL_SSE__
+ int4 a(_mm_cvtps_epi32(f.m128));
+#else
+ int4 a = {(int)f.x, (int)f.y, (int)f.z, (int)f.w};
+#endif
+ return a;
+}
+
+ccl_device_inline void print_int4(const char *label, const int4& a)
+{
+ printf("%s: %d %d %d %d\n", label, a.x, a.y, a.z, a.w);
+}
+#endif /* __KERNEL_GPU__ */
+
+CCL_NAMESPACE_END
+
+#endif /* __UTIL_TYPES_INT4_IMPL_H__ */
diff --git a/intern/cycles/util/util_types_uchar2.h b/intern/cycles/util/util_types_uchar2.h
new file mode 100644
index 00000000000..f618a2234ca
--- /dev/null
+++ b/intern/cycles/util/util_types_uchar2.h
@@ -0,0 +1,39 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __UTIL_TYPES_UCHAR2_H__
+#define __UTIL_TYPES_UCHAR2_H__
+
+#ifndef __UTIL_TYPES_H__
+# error "Do not include this file directly, include util_types.h instead."
+#endif
+
+CCL_NAMESPACE_BEGIN
+
+#ifndef __KERNEL_GPU__
+struct uchar2 {
+ uchar x, y;
+
+ __forceinline uchar operator[](int i) const;
+ __forceinline uchar& operator[](int i);
+};
+
+ccl_device_inline uchar2 make_uchar2(uchar x, uchar y);
+#endif /* __KERNEL_GPU__ */
+
+CCL_NAMESPACE_END
+
+#endif /* __UTIL_TYPES_UCHAR2_H__ */
diff --git a/intern/cycles/util/util_types_uchar2_impl.h b/intern/cycles/util/util_types_uchar2_impl.h
new file mode 100644
index 00000000000..d5f196d0ce0
--- /dev/null
+++ b/intern/cycles/util/util_types_uchar2_impl.h
@@ -0,0 +1,50 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __UTIL_TYPES_UCHAR2_IMPL_H__
+#define __UTIL_TYPES_UCHAR2_IMPL_H__
+
+#ifndef __UTIL_TYPES_H__
+# error "Do not include this file directly, include util_types.h instead."
+#endif
+
+CCL_NAMESPACE_BEGIN
+
+#ifndef __KERNEL_GPU__
+uchar uchar2::operator[](int i) const
+{
+ util_assert(i >= 0);
+ util_assert(i < 2);
+ return *(&x + i);
+}
+
+uchar& uchar2::operator[](int i)
+{
+ util_assert(i >= 0);
+ util_assert(i < 2);
+ return *(&x + i);
+}
+
+ccl_device_inline uchar2 make_uchar2(uchar x, uchar y)
+{
+ uchar2 a = {x, y};
+ return a;
+}
+#endif /* __KERNEL_GPU__ */
+
+CCL_NAMESPACE_END
+
+#endif /* __UTIL_TYPES_UCHAR2_IMPL_H__ */
diff --git a/intern/cycles/util/util_types_uchar3.h b/intern/cycles/util/util_types_uchar3.h
new file mode 100644
index 00000000000..1e3644e6fd6
--- /dev/null
+++ b/intern/cycles/util/util_types_uchar3.h
@@ -0,0 +1,39 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __UTIL_TYPES_UCHAR3_H__
+#define __UTIL_TYPES_UCHAR3_H__
+
+#ifndef __UTIL_TYPES_H__
+# error "Do not include this file directly, include util_types.h instead."
+#endif
+
+CCL_NAMESPACE_BEGIN
+
+#ifndef __KERNEL_GPU__
+struct uchar3 {
+ uchar x, y, z;
+
+ __forceinline uchar operator[](int i) const;
+ __forceinline uchar& operator[](int i);
+};
+
+ccl_device_inline uchar3 make_uchar3(uchar x, uchar y, uchar z);
+#endif /* __KERNEL_GPU__ */
+
+CCL_NAMESPACE_END
+
+#endif /* __UTIL_TYPES_UCHAR3_H__ */
diff --git a/intern/cycles/util/util_types_uchar3_impl.h b/intern/cycles/util/util_types_uchar3_impl.h
new file mode 100644
index 00000000000..611021efb7f
--- /dev/null
+++ b/intern/cycles/util/util_types_uchar3_impl.h
@@ -0,0 +1,50 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __UTIL_TYPES_UCHAR3_IMPL_H__
+#define __UTIL_TYPES_UCHAR3_IMPL_H__
+
+#ifndef __UTIL_TYPES_H__
+# error "Do not include this file directly, include util_types.h instead."
+#endif
+
+CCL_NAMESPACE_BEGIN
+
+#ifndef __KERNEL_GPU__
+uchar uchar3::operator[](int i) const
+{
+ util_assert(i >= 0);
+ util_assert(i < 3);
+ return *(&x + i);
+}
+
+uchar& uchar3::operator[](int i)
+{
+ util_assert(i >= 0);
+ util_assert(i < 3);
+ return *(&x + i);
+}
+
+ccl_device_inline uchar3 make_uchar3(uchar x, uchar y, uchar z)
+{
+ uchar3 a = {x, y, z};
+ return a;
+}
+#endif /* __KERNEL_GPU__ */
+
+CCL_NAMESPACE_END
+
+#endif /* __UTIL_TYPES_UCHAR3_IMPL_H__ */
diff --git a/intern/cycles/util/util_types_uchar4.h b/intern/cycles/util/util_types_uchar4.h
new file mode 100644
index 00000000000..3802cebbfb9
--- /dev/null
+++ b/intern/cycles/util/util_types_uchar4.h
@@ -0,0 +1,39 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __UTIL_TYPES_UCHAR4_H__
+#define __UTIL_TYPES_UCHAR4_H__
+
+#ifndef __UTIL_TYPES_H__
+# error "Do not include this file directly, include util_types.h instead."
+#endif
+
+CCL_NAMESPACE_BEGIN
+
+#ifndef __KERNEL_GPU__
+struct uchar4 {
+ uchar x, y, z, w;
+
+ __forceinline uchar operator[](int i) const;
+ __forceinline uchar& operator[](int i);
+};
+
+ccl_device_inline uchar4 make_uchar4(uchar x, uchar y, uchar z, uchar w);
+#endif /* __KERNEL_GPU__ */
+
+CCL_NAMESPACE_END
+
+#endif /* __UTIL_TYPES_UCHAR4_H__ */
diff --git a/intern/cycles/util/util_types_uchar4_impl.h b/intern/cycles/util/util_types_uchar4_impl.h
new file mode 100644
index 00000000000..03039f60c54
--- /dev/null
+++ b/intern/cycles/util/util_types_uchar4_impl.h
@@ -0,0 +1,50 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __UTIL_TYPES_UCHAR4_IMPL_H__
+#define __UTIL_TYPES_UCHAR4_IMPL_H__
+
+#ifndef __UTIL_TYPES_H__
+# error "Do not include this file directly, include util_types.h instead."
+#endif
+
+CCL_NAMESPACE_BEGIN
+
+#ifndef __KERNEL_GPU__
+uchar uchar4::operator[](int i) const
+{
+ util_assert(i >= 0);
+ util_assert(i < 4);
+ return *(&x + i);
+}
+
+uchar& uchar4::operator[](int i)
+{
+ util_assert(i >= 0);
+ util_assert(i < 4);
+ return *(&x + i);
+}
+
+ccl_device_inline uchar4 make_uchar4(uchar x, uchar y, uchar z, uchar w)
+{
+ uchar4 a = {x, y, z, w};
+ return a;
+}
+#endif /* __KERNEL_GPU__ */
+
+CCL_NAMESPACE_END
+
+#endif /* __UTIL_TYPES_UCHAR4_IMPL_H__ */
diff --git a/intern/cycles/util/util_types_uint2.h b/intern/cycles/util/util_types_uint2.h
new file mode 100644
index 00000000000..c4a31899614
--- /dev/null
+++ b/intern/cycles/util/util_types_uint2.h
@@ -0,0 +1,39 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __UTIL_TYPES_UINT2_H__
+#define __UTIL_TYPES_UINT2_H__
+
+#ifndef __UTIL_TYPES_H__
+# error "Do not include this file directly, include util_types.h instead."
+#endif
+
+CCL_NAMESPACE_BEGIN
+
+#ifndef __KERNEL_GPU__
+struct uint2 {
+ uint x, y;
+
+ __forceinline uint operator[](uint i) const;
+ __forceinline uint& operator[](uint i);
+};
+
+ccl_device_inline uint2 make_uint2(uint x, uint y);
+#endif /* __KERNEL_GPU__ */
+
+CCL_NAMESPACE_END
+
+#endif /* __UTIL_TYPES_UINT2_H__ */
diff --git a/intern/cycles/util/util_types_uint2_impl.h b/intern/cycles/util/util_types_uint2_impl.h
new file mode 100644
index 00000000000..b50ffa2667f
--- /dev/null
+++ b/intern/cycles/util/util_types_uint2_impl.h
@@ -0,0 +1,48 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __UTIL_TYPES_UINT2_IMPL_H__
+#define __UTIL_TYPES_UINT2_IMPL_H__
+
+#ifndef __UTIL_TYPES_H__
+# error "Do not include this file directly, include util_types.h instead."
+#endif
+
+CCL_NAMESPACE_BEGIN
+
+#ifndef __KERNEL_GPU__
+__forceinline uint uint2::operator[](uint i) const
+{
+ util_assert(i < 2);
+ return *(&x + i);
+}
+
+__forceinline uint& uint2::operator[](uint i)
+{
+ util_assert(i < 2);
+ return *(&x + i);
+}
+
+ccl_device_inline uint2 make_uint2(uint x, uint y)
+{
+ uint2 a = {x, y};
+ return a;
+}
+#endif /* __KERNEL_GPU__ */
+
+CCL_NAMESPACE_END
+
+#endif /* __UTIL_TYPES_UINT2_IMPL_H__ */
diff --git a/intern/cycles/util/util_types_uint3.h b/intern/cycles/util/util_types_uint3.h
new file mode 100644
index 00000000000..aeeecd2df06
--- /dev/null
+++ b/intern/cycles/util/util_types_uint3.h
@@ -0,0 +1,39 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __UTIL_TYPES_UINT3_H__
+#define __UTIL_TYPES_UINT3_H__
+
+#ifndef __UTIL_TYPES_H__
+# error "Do not include this file directly, include util_types.h instead."
+#endif
+
+CCL_NAMESPACE_BEGIN
+
+#ifndef __KERNEL_GPU__
+struct uint3 {
+ uint x, y, z;
+
+ __forceinline uint operator[](uint i) const;
+ __forceinline uint& operator[](uint i);
+};
+
+ccl_device_inline uint3 make_uint3(uint x, uint y, uint z);
+#endif /* __KERNEL_GPU__ */
+
+CCL_NAMESPACE_END
+
+#endif /* __UTIL_TYPES_UINT3_H__ */
diff --git a/intern/cycles/util/util_types_uint3_impl.h b/intern/cycles/util/util_types_uint3_impl.h
new file mode 100644
index 00000000000..26005d5baff
--- /dev/null
+++ b/intern/cycles/util/util_types_uint3_impl.h
@@ -0,0 +1,48 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __UTIL_TYPES_UINT3_IMPL_H__
+#define __UTIL_TYPES_UINT3_IMPL_H__
+
+#ifndef __UTIL_TYPES_H__
+# error "Do not include this file directly, include util_types.h instead."
+#endif
+
+CCL_NAMESPACE_BEGIN
+
+#ifndef __KERNEL_GPU__
+__forceinline uint uint3::operator[](uint i) const
+{
+ util_assert(i < 3);
+ return *(&x + i);
+}
+
+__forceinline uint& uint3::operator[](uint i)
+{
+ util_assert(i < 3);
+ return *(&x + i);
+}
+
+ccl_device_inline uint3 make_uint3(uint x, uint y, uint z)
+{
+ uint3 a = {x, y, z};
+ return a;
+}
+#endif /* __KERNEL_GPU__ */
+
+CCL_NAMESPACE_END
+
+#endif /* __UTIL_TYPES_UINT3_IMPL_H__ */
diff --git a/intern/cycles/util/util_types_uint4.h b/intern/cycles/util/util_types_uint4.h
new file mode 100644
index 00000000000..2d3a7bb85e4
--- /dev/null
+++ b/intern/cycles/util/util_types_uint4.h
@@ -0,0 +1,39 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __UTIL_TYPES_UINT4_H__
+#define __UTIL_TYPES_UINT4_H__
+
+#ifndef __UTIL_TYPES_H__
+# error "Do not include this file directly, include util_types.h instead."
+#endif
+
+CCL_NAMESPACE_BEGIN
+
+#ifndef __KERNEL_GPU__
+struct uint4 {
+ uint x, y, z, w;
+
+ __forceinline uint operator[](uint i) const;
+ __forceinline uint& operator[](uint i);
+};
+
+ccl_device_inline uint4 make_uint4(uint x, uint y, uint z, uint w);
+#endif /* __KERNEL_GPU__ */
+
+CCL_NAMESPACE_END
+
+#endif /* __UTIL_TYPES_UINT4_H__ */
diff --git a/intern/cycles/util/util_types_uint4_impl.h b/intern/cycles/util/util_types_uint4_impl.h
new file mode 100644
index 00000000000..6d48131a446
--- /dev/null
+++ b/intern/cycles/util/util_types_uint4_impl.h
@@ -0,0 +1,48 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __UTIL_TYPES_UINT4_IMPL_H__
+#define __UTIL_TYPES_UINT4_IMPL_H__
+
+#ifndef __UTIL_TYPES_H__
+# error "Do not include this file directly, include util_types.h instead."
+#endif
+
+CCL_NAMESPACE_BEGIN
+
+#ifndef __KERNEL_GPU__
+__forceinline uint uint4::operator[](uint i) const
+{
+ util_assert(i < 3);
+ return *(&x + i);
+}
+
+__forceinline uint& uint4::operator[](uint i)
+{
+ util_assert(i < 3);
+ return *(&x + i);
+}
+
+ccl_device_inline uint4 make_uint4(uint x, uint y, uint z, uint w)
+{
+ uint4 a = {x, y, z, w};
+ return a;
+}
+#endif /* __KERNEL_GPU__ */
+
+CCL_NAMESPACE_END
+
+#endif /* __UTIL_TYPES_UINT4_IMPL_H__ */
diff --git a/intern/cycles/util/util_types_vector3.h b/intern/cycles/util/util_types_vector3.h
new file mode 100644
index 00000000000..12acf9dc959
--- /dev/null
+++ b/intern/cycles/util/util_types_vector3.h
@@ -0,0 +1,41 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __UTIL_TYPES_VECTOR3_H__
+#define __UTIL_TYPES_VECTOR3_H__
+
+#ifndef __UTIL_TYPES_H__
+# error "Do not include this file directly, include util_types.h instead."
+#endif
+
+CCL_NAMESPACE_BEGIN
+
+#ifndef __KERNEL_GPU__
+template<typename T>
+class vector3
+{
+public:
+ T x, y, z;
+
+ __forceinline vector3();
+ __forceinline vector3(const T& a);
+ __forceinline vector3(const T& x, const T& y, const T& z);
+};
+#endif /* __KERNEL_GPU__ */
+
+CCL_NAMESPACE_END
+
+#endif /* __UTIL_TYPES_VECTOR3_H__ */
diff --git a/intern/cycles/util/util_types_vector3_impl.h b/intern/cycles/util/util_types_vector3_impl.h
new file mode 100644
index 00000000000..2f6b8368540
--- /dev/null
+++ b/intern/cycles/util/util_types_vector3_impl.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __UTIL_TYPES_VECTOR3_IMPL_H__
+#define __UTIL_TYPES_VECTOR3_IMPL_H__
+
+#ifndef __UTIL_TYPES_H__
+# error "Do not include this file directly, include util_types.h instead."
+#endif
+
+CCL_NAMESPACE_BEGIN
+
+#ifndef __KERNEL_GPU__
+template<typename T>
+ccl_always_inline vector3<T>::vector3()
+{
+}
+
+template<typename T>
+ccl_always_inline vector3<T>::vector3(const T& a)
+ : x(a), y(a), z(a)
+{
+}
+
+template<typename T>
+ccl_always_inline vector3<T>::vector3(const T& x, const T& y, const T& z)
+ : x(x), y(y), z(z)
+{
+}
+#endif /* __KERNEL_GPU__ */
+
+CCL_NAMESPACE_END
+
+#endif /* __UTIL_TYPES_VECTOR3_IMPL_H__ */
diff --git a/intern/dualcon/intern/Projections.h b/intern/dualcon/intern/Projections.h
index 2d1eca55997..2d245a77d15 100644
--- a/intern/dualcon/intern/Projections.h
+++ b/intern/dualcon/intern/Projections.h
@@ -29,7 +29,7 @@
#define CONTAINS_INDEX
#define GRID_DIMENSION 20
-#if defined(_WIN32) && !defined(__MINGW32__) && !(_MSC_VER >= 1900)
+#if defined(_WIN32) && !(_MSC_VER >= 1900)
#define isnan(n) _isnan(n)
#define LONG __int64
#define int64_t __int64
diff --git a/intern/dualcon/intern/dualcon_c_api.cpp b/intern/dualcon/intern/dualcon_c_api.cpp
index e55de2ed354..92f8b0cfd1a 100644
--- a/intern/dualcon/intern/dualcon_c_api.cpp
+++ b/intern/dualcon/intern/dualcon_c_api.cpp
@@ -28,7 +28,7 @@
#include <cstdio>
#include <float.h>
-#if defined(_WIN32) && !defined(__MINGW32__)
+#if defined(_WIN32)
#define isnan(n) _isnan(n)
#endif
diff --git a/intern/elbeem/intern/isosurface.cpp b/intern/elbeem/intern/isosurface.cpp
index fb61fb416b4..de7bfe8e687 100644
--- a/intern/elbeem/intern/isosurface.cpp
+++ b/intern/elbeem/intern/isosurface.cpp
@@ -15,6 +15,7 @@
#include "particletracer.h"
#include <algorithm>
#include <stdio.h>
+#include <cmath>
#ifdef sun
#include "ieeefp.h"
@@ -25,6 +26,8 @@
#define round(x) (x)
#endif
+using std::isfinite;
+
/******************************************************************************
* Constructor
*****************************************************************************/
@@ -937,17 +940,10 @@ void IsoSurface::smoothSurface(float sigma, bool normSmooth)
ew[(j+2)%3]);
}
- // NT important, check this...
-#ifndef WIN32
- if(! finite(cornerareas[i][0]) ) cornerareas[i][0]=1e-6;
- if(! finite(cornerareas[i][1]) ) cornerareas[i][1]=1e-6;
- if(! finite(cornerareas[i][2]) ) cornerareas[i][2]=1e-6;
-#else // WIN32
- // FIXME check as well...
- if(! (cornerareas[i][0]>=0.0) ) cornerareas[i][0]=1e-6;
- if(! (cornerareas[i][1]>=0.0) ) cornerareas[i][1]=1e-6;
- if(! (cornerareas[i][2]>=0.0) ) cornerareas[i][2]=1e-6;
-#endif // WIN32
+ // FIX T50887: ensure pointareas are finite
+ if (!isfinite(cornerareas[i][0])) cornerareas[i][0] = 1e-6;
+ if (!isfinite(cornerareas[i][1])) cornerareas[i][1] = 1e-6;
+ if (!isfinite(cornerareas[i][2])) cornerareas[i][2] = 1e-6;
pointareas[mIndices[i*3+0]] += cornerareas[i][0];
pointareas[mIndices[i*3+1]] += cornerareas[i][1];
@@ -1096,17 +1092,10 @@ void IsoSurface::smoothNormals(float sigma) {
ew[(j+2)%3]);
}
- // NT important, check this...
-#ifndef WIN32
- if(! finite(cornerareas[i][0]) ) cornerareas[i][0]=1e-6;
- if(! finite(cornerareas[i][1]) ) cornerareas[i][1]=1e-6;
- if(! finite(cornerareas[i][2]) ) cornerareas[i][2]=1e-6;
-#else // WIN32
- // FIXME check as well...
- if(! (cornerareas[i][0]>=0.0) ) cornerareas[i][0]=1e-6;
- if(! (cornerareas[i][1]>=0.0) ) cornerareas[i][1]=1e-6;
- if(! (cornerareas[i][2]>=0.0) ) cornerareas[i][2]=1e-6;
-#endif // WIN32
+ // FIX T50887: ensure pointareas are finite
+ if (!isfinite(cornerareas[i][0])) cornerareas[i][0] = 1e-6;
+ if (!isfinite(cornerareas[i][1])) cornerareas[i][1] = 1e-6;
+ if (!isfinite(cornerareas[i][2])) cornerareas[i][2] = 1e-6;
pointareas[mIndices[i*3+0]] += cornerareas[i][0];
pointareas[mIndices[i*3+1]] += cornerareas[i][1];
diff --git a/intern/elbeem/intern/mvmcoords.h b/intern/elbeem/intern/mvmcoords.h
index 56d991aac6e..deeedcf9dd4 100644
--- a/intern/elbeem/intern/mvmcoords.h
+++ b/intern/elbeem/intern/mvmcoords.h
@@ -23,12 +23,10 @@
#define mvmFloat double
#ifdef WIN32
-#ifndef FREE_WINDOWS
#include "float.h"
#define isnan(n) _isnan(n)
#define finite _finite
#endif
-#endif
#ifdef sun
#include "ieeefp.h"
diff --git a/intern/elbeem/intern/solver_util.cpp b/intern/elbeem/intern/solver_util.cpp
index 6eca427c787..f0c7bce2b4e 100644
--- a/intern/elbeem/intern/solver_util.cpp
+++ b/intern/elbeem/intern/solver_util.cpp
@@ -855,6 +855,10 @@ void LbmFsgrSolver::advanceParticles() {
if(k<=mSizez-1-cutval){
CellFlagType pflag = RFLAG(level, i,j,k, workSet);
//errMsg("PIT move"," at "<<PRINT_IJK<<" flag"<<convertCellFlagType2String(pflag) );
+ if (pflag & CFMbndOutflow) {
+ DEL_PART;
+ continue;
+ }
if(pflag & (CFBnd)) {
handleObstacleParticle(p);
continue;
diff --git a/intern/ffmpeg/ffmpeg_compat.h b/intern/ffmpeg/ffmpeg_compat.h
index d6220ebf562..9c06c8a6d67 100644
--- a/intern/ffmpeg/ffmpeg_compat.h
+++ b/intern/ffmpeg/ffmpeg_compat.h
@@ -430,16 +430,11 @@ void av_frame_free(AVFrame **frame)
FFMPEG_INLINE
AVRational av_get_r_frame_rate_compat(const AVStream *stream)
{
- /* Stupid way to distinguish FFmpeg from Libav. */
-#if LIBAVCODEC_VERSION_MICRO >= 100
- return stream->r_frame_rate;
-#else
-# if LIBAVCODEC_VERSION_INT < AV_VERSION_INT(54, 23, 1)
+#if LIBAVCODEC_VERSION_INT < AV_VERSION_INT(54, 23, 1)
/* For until r_frame_rate was deprecated use it. */
return stream->r_frame_rate;
-# else
+#else
return stream->avg_frame_rate;
-# endif
#endif
}
diff --git a/intern/ghost/GHOST_C-api.h b/intern/ghost/GHOST_C-api.h
index 6887063eae9..967d3f58143 100644
--- a/intern/ghost/GHOST_C-api.h
+++ b/intern/ghost/GHOST_C-api.h
@@ -43,7 +43,7 @@ extern "C" {
* Creates a "handle" for a C++ GHOST object.
* A handle is just an opaque pointer to an empty struct.
* In the API the pointer is casted to the actual C++ class.
- * \param name Name of the handle to create.
+ * The 'name' argument to the macro is the name of the handle to create.
*/
GHOST_DECLARE_HANDLE(GHOST_SystemHandle);
diff --git a/intern/ghost/intern/GHOST_Context.cpp b/intern/ghost/intern/GHOST_Context.cpp
index f69f2181ef7..72db17c4f56 100644
--- a/intern/ghost/intern/GHOST_Context.cpp
+++ b/intern/ghost/intern/GHOST_Context.cpp
@@ -38,7 +38,7 @@
# include <tchar.h>
#
# ifndef ERROR_PROFILE_DOES_NOT_MATCH_DEVICE
-# define ERROR_PROFILE_DOES_NOT_MATCH_DEVICE 0x7E7 // Mingw64 headers may have had this
+# define ERROR_PROFILE_DOES_NOT_MATCH_DEVICE 0x7E7
# endif
#endif
diff --git a/intern/ghost/intern/GHOST_DisplayManagerWin32.cpp b/intern/ghost/intern/GHOST_DisplayManagerWin32.cpp
index 252ea775329..7b9a897fe57 100644
--- a/intern/ghost/intern/GHOST_DisplayManagerWin32.cpp
+++ b/intern/ghost/intern/GHOST_DisplayManagerWin32.cpp
@@ -41,9 +41,7 @@
// We do not support multiple monitors at the moment
#define COMPILE_MULTIMON_STUBS
-#ifndef FREE_WINDOWS
#include <multimon.h>
-#endif
GHOST_DisplayManagerWin32::GHOST_DisplayManagerWin32(void)
diff --git a/intern/ghost/intern/GHOST_SystemCocoa.h b/intern/ghost/intern/GHOST_SystemCocoa.h
index b142c2f7194..6802ad42c7b 100644
--- a/intern/ghost/intern/GHOST_SystemCocoa.h
+++ b/intern/ghost/intern/GHOST_SystemCocoa.h
@@ -292,11 +292,6 @@ protected:
/** Ignores window size messages (when window is dragged). */
bool m_ignoreWindowSizedMessages;
- /** Stores the mouse cursor delta due to setting a new cursor position
- * Needed because cocoa event delta cursor move takes setCursorPosition changes too.
- */
- GHOST_TInt32 m_cursorDelta_x, m_cursorDelta_y;
-
/** Temporarily ignore momentum scroll events */
bool m_ignoreMomentumScroll;
/** Is the scroll wheel event generated by a multitouch trackpad or mouse? */
diff --git a/intern/ghost/intern/GHOST_SystemCocoa.mm b/intern/ghost/intern/GHOST_SystemCocoa.mm
index 173f59c9c8f..4582dfb2a49 100644
--- a/intern/ghost/intern/GHOST_SystemCocoa.mm
+++ b/intern/ghost/intern/GHOST_SystemCocoa.mm
@@ -366,8 +366,6 @@ GHOST_SystemCocoa::GHOST_SystemCocoa()
char *rstring = NULL;
m_modifierMask =0;
- m_cursorDelta_x=0;
- m_cursorDelta_y=0;
m_outsideLoopEventProcessed = false;
m_needDelayedApplicationBecomeActiveEventProcessing = false;
m_displayManager = new GHOST_DisplayManagerCocoa ();
@@ -644,6 +642,13 @@ GHOST_TSuccess GHOST_SystemCocoa::setMouseCursorPosition(GHOST_TInt32 x, GHOST_T
CGDisplayMoveCursorToPoint((CGDirectDisplayID)[[[windowScreen deviceDescription] objectForKey:@"NSScreenNumber"] unsignedIntValue], CGPointMake(xf, yf));
+ // See https://stackoverflow.com/a/17559012. By default, hardware events
+ // will be suppressed for 500ms after a synthetic mouse event. For unknown
+ // reasons CGEventSourceSetLocalEventsSuppressionInterval does not work,
+ // however calling CGAssociateMouseAndMouseCursorPosition also removes the
+ // delay, even if this is undocumented.
+ CGAssociateMouseAndMouseCursorPosition(true);
+
[pool drain];
return GHOST_kSuccess;
}
@@ -1354,9 +1359,8 @@ GHOST_TSuccess GHOST_SystemCocoa::handleMouseEvent(void *eventPtr)
case GHOST_kGrabWrap: //Wrap cursor at area/window boundaries
{
NSPoint mousePos = [cocoawindow mouseLocationOutsideOfEventStream];
- GHOST_TInt32 x_mouse= mousePos.x;
- GHOST_TInt32 y_mouse= mousePos.y;
- GHOST_TInt32 x_accum, y_accum, x_cur, y_cur, x, y;
+ GHOST_TInt32 x_mouse = mousePos.x;
+ GHOST_TInt32 y_mouse = mousePos.y;
GHOST_Rect bounds, windowBounds, correctedBounds;
/* fallback to window bounds */
@@ -1370,29 +1374,26 @@ GHOST_TSuccess GHOST_SystemCocoa::handleMouseEvent(void *eventPtr)
correctedBounds.m_b = (windowBounds.m_b - windowBounds.m_t) - correctedBounds.m_b;
correctedBounds.m_t = (windowBounds.m_b - windowBounds.m_t) - correctedBounds.m_t;
- //Update accumulation counts
+ //Get accumulation from previous mouse warps
+ GHOST_TInt32 x_accum, y_accum;
window->getCursorGrabAccum(x_accum, y_accum);
- x_accum += [event deltaX]-m_cursorDelta_x;
- y_accum += -[event deltaY]-m_cursorDelta_y; //Strange Apple implementation (inverted coordinates for the deltaY) ...
- window->setCursorGrabAccum(x_accum, y_accum);
//Warp mouse cursor if needed
- x_mouse += [event deltaX]-m_cursorDelta_x;
- y_mouse += -[event deltaY]-m_cursorDelta_y;
- correctedBounds.wrapPoint(x_mouse, y_mouse, 2);
-
- //Compensate for mouse moved event taking cursor position set into account
- m_cursorDelta_x = x_mouse-mousePos.x;
- m_cursorDelta_y = y_mouse-mousePos.y;
+ GHOST_TInt32 warped_x_mouse = x_mouse;
+ GHOST_TInt32 warped_y_mouse = y_mouse;
+ correctedBounds.wrapPoint(warped_x_mouse, warped_y_mouse, 4);
//Set new cursor position
- window->clientToScreenIntern(x_mouse, y_mouse, x_cur, y_cur);
- setMouseCursorPosition(x_cur, y_cur); /* wrap */
+ if (x_mouse != warped_x_mouse || y_mouse != warped_y_mouse) {
+ GHOST_TInt32 warped_x, warped_y;
+ window->clientToScreenIntern(warped_x_mouse, warped_y_mouse, warped_x, warped_y);
+ setMouseCursorPosition(warped_x, warped_y); /* wrap */
+ window->setCursorGrabAccum(x_accum + (x_mouse - warped_x_mouse), y_accum + (y_mouse - warped_y_mouse));
+ }
- //Post event
- window->getCursorGrabInitPos(x_cur, y_cur);
- window->screenToClientIntern(x_cur, y_cur, x_cur, y_cur);
- window->clientToScreenIntern(x_cur + x_accum, y_cur + y_accum, x, y);
+ //Generate event
+ GHOST_TInt32 x, y;
+ window->clientToScreenIntern(x_mouse + x_accum, y_mouse + y_accum, x, y);
pushEvent(new GHOST_EventCursor([event timestamp] * 1000, GHOST_kEventCursorMove, window, x, y));
break;
}
@@ -1404,9 +1405,6 @@ GHOST_TSuccess GHOST_SystemCocoa::handleMouseEvent(void *eventPtr)
window->clientToScreenIntern(mousePos.x, mousePos.y, x, y);
pushEvent(new GHOST_EventCursor([event timestamp] * 1000, GHOST_kEventCursorMove, window, x, y));
-
- m_cursorDelta_x=0;
- m_cursorDelta_y=0; //Mouse motion occurred between two cursor warps, so we can reset the delta counter
break;
}
}
diff --git a/intern/ghost/intern/GHOST_SystemPathsWin32.cpp b/intern/ghost/intern/GHOST_SystemPathsWin32.cpp
index 7d0ce5158fe..8056bc76edb 100644
--- a/intern/ghost/intern/GHOST_SystemPathsWin32.cpp
+++ b/intern/ghost/intern/GHOST_SystemPathsWin32.cpp
@@ -37,30 +37,6 @@
#include <shlobj.h>
#include "utfconv.h"
-#ifdef __MINGW32__
-
-#if !defined(SHARD_PIDL)
-#define SHARD_PIDL 0x00000001L
-#endif
-
-#if !defined(SHARD_PATHA)
-#define SHARD_PATHA 0x00000002L
-#endif
-
-#if !defined(SHARD_PATHW)
-#define SHARD_PATHW 0x00000003L
-#endif
-
-#if !defined(SHARD_PATH)
-#ifdef UNICODE
-#define SHARD_PATH SHARD_PATHW
-#else
-#define SHARD_PATH SHARD_PATHA
-#endif
-#endif
-
-#endif
-
GHOST_SystemPathsWin32::GHOST_SystemPathsWin32()
{
}
diff --git a/intern/ghost/intern/GHOST_SystemWin32.cpp b/intern/ghost/intern/GHOST_SystemWin32.cpp
index 240d7ccd2fe..b0dae432643 100644
--- a/intern/ghost/intern/GHOST_SystemWin32.cpp
+++ b/intern/ghost/intern/GHOST_SystemWin32.cpp
@@ -890,26 +890,7 @@ bool GHOST_SystemWin32::processNDOF(RAWINPUT const &raw)
// send motion. Mark as 'sent' so motion will always get dispatched.
eventSent = true;
-#if defined(_MSC_VER) || defined(FREE_WINDOWS64)
- // using Microsoft compiler & header files
- // they invented the RawInput API, so this version is (probably) correct.
- // MinGW64 also works fine with this
BYTE const *data = raw.data.hid.bRawData;
- // struct RAWHID {
- // DWORD dwSizeHid;
- // DWORD dwCount;
- // BYTE bRawData[1];
- // };
-#else
- // MinGW's definition (below) doesn't agree, so we need a slight
- // workaround until it's fixed
- BYTE const *data = &raw.data.hid.bRawData;
- // struct RAWHID {
- // DWORD dwSizeHid;
- // DWORD dwCount;
- // BYTE bRawData; // <== isn't this s'posed to be a BYTE*?
- // };
-#endif
BYTE packetType = data[0];
switch (packetType) {
@@ -960,6 +941,8 @@ LRESULT WINAPI GHOST_SystemWin32::s_wndProc(HWND hwnd, UINT msg, WPARAM wParam,
GHOST_ASSERT(system, "GHOST_SystemWin32::s_wndProc(): system not initialized");
if (hwnd) {
+#if 0
+ // Disabled due to bug in Intel drivers, see T51959
if(msg == WM_NCCREATE) {
// Tell Windows to automatically handle scaling of non-client areas
// such as the caption bar. EnableNonClientDpiScaling was introduced in Windows 10
@@ -973,6 +956,7 @@ LRESULT WINAPI GHOST_SystemWin32::s_wndProc(HWND hwnd, UINT msg, WPARAM wParam,
}
}
}
+#endif
GHOST_WindowWin32 *window = (GHOST_WindowWin32 *)::GetWindowLongPtr(hwnd, GWLP_USERDATA);
if (window) {
diff --git a/intern/ghost/intern/GHOST_SystemWin32.h b/intern/ghost/intern/GHOST_SystemWin32.h
index d534a300b35..099d14e68ae 100644
--- a/intern/ghost/intern/GHOST_SystemWin32.h
+++ b/intern/ghost/intern/GHOST_SystemWin32.h
@@ -37,10 +37,10 @@
#error WIN32 only!
#endif // WIN32
-#ifndef __MINGW64__
-# undef _WIN32_WINNT
-# define _WIN32_WINNT 0x501 // require Windows XP or newer
-#endif
+/* require Windows XP or newer */
+#undef _WIN32_WINNT
+#define _WIN32_WINNT 0x501
+
#define WIN32_LEAN_AND_MEAN
#include <windows.h>
#include <ole2.h> // for drag-n-drop
diff --git a/intern/ghost/intern/GHOST_TaskbarWin32.h b/intern/ghost/intern/GHOST_TaskbarWin32.h
index 6fcff297237..0ef71754717 100644
--- a/intern/ghost/intern/GHOST_TaskbarWin32.h
+++ b/intern/ghost/intern/GHOST_TaskbarWin32.h
@@ -8,10 +8,10 @@
#error WIN32 only!
#endif // WIN32
-#ifndef __MINGW64__
-# undef _WIN32_WINNT
-# define _WIN32_WINNT 0x501 // require Windows XP or newer
-#endif
+/* require Windows XP or newer */
+#undef _WIN32_WINNT
+#define _WIN32_WINNT 0x501
+
#define WIN32_LEAN_AND_MEAN
#include <windows.h>
#include <shlobj.h>
diff --git a/intern/ghost/intern/GHOST_WindowCocoa.h b/intern/ghost/intern/GHOST_WindowCocoa.h
index b234291396b..5168c48ca2f 100644
--- a/intern/ghost/intern/GHOST_WindowCocoa.h
+++ b/intern/ghost/intern/GHOST_WindowCocoa.h
@@ -56,7 +56,7 @@ public:
* \param systemCocoa The associated system class to forward events to
* \param title The text shown in the title bar of the window.
* \param left The coordinate of the left edge of the window.
- * \param top The coordinate of the top edge of the window.
+ * \param bottom The coordinate of the bottom edge of the window.
* \param width The width the window.
* \param height The height the window.
* \param state The state the window is initially opened with.
diff --git a/intern/ghost/intern/GHOST_WindowCocoa.mm b/intern/ghost/intern/GHOST_WindowCocoa.mm
index 97615dcea96..73c89f9d68d 100644
--- a/intern/ghost/intern/GHOST_WindowCocoa.mm
+++ b/intern/ghost/intern/GHOST_WindowCocoa.mm
@@ -1370,9 +1370,6 @@ GHOST_TSuccess GHOST_WindowCocoa::setWindowCursorGrab(GHOST_TGrabCursorMode mode
//Make window key if it wasn't to get the mouse move events
[m_window makeKeyWindow];
- //Dissociate cursor position even for warp mode, to allow mouse acceleration to work even when warping the cursor
- err = CGAssociateMouseAndMouseCursorPosition(false) == kCGErrorSuccess ? GHOST_kSuccess : GHOST_kFailure;
-
[pool drain];
}
}
@@ -1382,7 +1379,6 @@ GHOST_TSuccess GHOST_WindowCocoa::setWindowCursorGrab(GHOST_TGrabCursorMode mode
setWindowCursorVisibility(true);
}
- err = CGAssociateMouseAndMouseCursorPosition(true) == kCGErrorSuccess ? GHOST_kSuccess : GHOST_kFailure;
/* Almost works without but important otherwise the mouse GHOST location can be incorrect on exit */
setCursorGrabAccum(0, 0);
m_cursorGrabBounds.m_l= m_cursorGrabBounds.m_r= -1; /* disable */
diff --git a/intern/ghost/intern/GHOST_WindowWin32.cpp b/intern/ghost/intern/GHOST_WindowWin32.cpp
index fc46164c135..7ac54e5c915 100644
--- a/intern/ghost/intern/GHOST_WindowWin32.cpp
+++ b/intern/ghost/intern/GHOST_WindowWin32.cpp
@@ -890,19 +890,14 @@ void GHOST_WindowWin32::processWin32TabletEvent(WPARAM wParam, LPARAM lParam)
if (fpWTPacket) {
if (fpWTPacket((HCTX)lParam, wParam, &pkt)) {
if (m_tabletData) {
- switch (pkt.pkCursor) {
- case 0: /* first device */
- case 3: /* second device */
+ switch (pkt.pkCursor % 3) { /* % 3 for multiple devices ("DualTrack") */
+ case 0:
m_tabletData->Active = GHOST_kTabletModeNone; /* puck - not yet supported */
break;
case 1:
- case 4:
- case 7:
m_tabletData->Active = GHOST_kTabletModeStylus; /* stylus */
break;
case 2:
- case 5:
- case 8:
m_tabletData->Active = GHOST_kTabletModeEraser; /* eraser */
break;
}
diff --git a/intern/guardedalloc/intern/mallocn_intern.h b/intern/guardedalloc/intern/mallocn_intern.h
index 3f7e462c1c7..a292a2eb5a0 100644
--- a/intern/guardedalloc/intern/mallocn_intern.h
+++ b/intern/guardedalloc/intern/mallocn_intern.h
@@ -89,14 +89,6 @@
#if !defined(__APPLE__) && !defined(__FreeBSD__) && !defined(__NetBSD__)
// Needed for memalign on Linux and _aligned_alloc on Windows.
-# ifdef FREE_WINDOWS
-/* make sure _aligned_malloc is included */
-# ifdef __MSVCRT_VERSION__
-# undef __MSVCRT_VERSION__
-# endif
-
-# define __MSVCRT_VERSION__ 0x0700
-# endif // FREE_WINDOWS
# include <malloc.h>
#else
diff --git a/intern/guardedalloc/intern/mallocn_lockfree_impl.c b/intern/guardedalloc/intern/mallocn_lockfree_impl.c
index ce8a5b29ece..b4838cdca18 100644
--- a/intern/guardedalloc/intern/mallocn_lockfree_impl.c
+++ b/intern/guardedalloc/intern/mallocn_lockfree_impl.c
@@ -64,9 +64,9 @@ enum {
MEMHEAD_ALIGN_FLAG = 2,
};
-#define MEMHEAD_FROM_PTR(ptr) (((MemHead*) vmemh) - 1)
+#define MEMHEAD_FROM_PTR(ptr) (((MemHead*) ptr) - 1)
#define PTR_FROM_MEMHEAD(memhead) (memhead + 1)
-#define MEMHEAD_ALIGNED_FROM_PTR(ptr) (((MemHeadAligned*) vmemh) - 1)
+#define MEMHEAD_ALIGNED_FROM_PTR(ptr) (((MemHeadAligned*) ptr) - 1)
#define MEMHEAD_IS_MMAP(memhead) ((memhead)->len & (size_t) MEMHEAD_MMAP_FLAG)
#define MEMHEAD_IS_ALIGNED(memhead) ((memhead)->len & (size_t) MEMHEAD_ALIGN_FLAG)
diff --git a/intern/libmv/CMakeLists.txt b/intern/libmv/CMakeLists.txt
index cd89f1d84b5..b67a23b4159 100644
--- a/intern/libmv/CMakeLists.txt
+++ b/intern/libmv/CMakeLists.txt
@@ -41,9 +41,10 @@ if(WITH_LIBMV)
add_definitions(${GFLAGS_DEFINES})
add_definitions(${GLOG_DEFINES})
add_definitions(${CERES_DEFINES})
+ add_definitions(-DLIBMV_GFLAGS_NAMESPACE=${GFLAGS_NAMESPACE})
list(APPEND INC
- ../../extern/gflags/src
+ ${GFLAGS_INCLUDE_DIRS}
../../extern/glog/src
../../extern/ceres/include
../../extern/ceres/config
diff --git a/intern/libmv/ChangeLog b/intern/libmv/ChangeLog
index 45be9c25afa..81096dd90c9 100644
--- a/intern/libmv/ChangeLog
+++ b/intern/libmv/ChangeLog
@@ -1,3 +1,156 @@
+commit efd7a93317e0278b99e66785f667823e451daef1
+Author: Sergey Sharybin <sergey.vfx@gmail.com>
+Date: Tue May 9 10:16:42 2017 +0200
+
+ Fix strict compiler warnings, unused variables
+
+commit 8efd47e13dfdd3f7209bc96f26d0b13127dd6376
+Author: Sergey Sharybin <sergey.vfx@gmail.com>
+Date: Wed Dec 14 10:44:57 2016 +0100
+
+ Fix T50243: libmv_panography_test is broken
+
+ There was fully wrong logic in comparison: was actually accessing memory
+ past the array boundary. Run test manually and the figure seems correct
+ to me now.
+
+ Spotted by @LazyDodo, thanks!
+
+commit 6dfb9cd1bd14669d84be789000ce234747fb00ff
+Author: Sergey Sharybin <sergey.vfx@gmail.com>
+Date: Thu Jul 14 11:49:38 2016 +0200
+
+ Fix some strict compiler warnings
+
+ One of them was a real bug!
+
+commit f61adaecf7b29ebe6677be0e1c825f0a8d475e4b
+Author: Sergey Sharybin <sergey.vfx@gmail.com>
+Date: Wed May 31 11:22:34 2017 +0200
+
+ Enable explicit schur complement for BA step
+
+ This is something we do in Blender and only reason it was not
+ enabled for standalone Libmv is because we did not have fresh
+ enough version of Ceres bundled.
+
+commit fc5d3a1d4880c6658aff693c1c1e8c10c96ce1a7
+Author: Sergey Sharybin <sergey.vfx@gmail.com>
+Date: Wed Nov 2 15:32:11 2016 +0100
+
+ Update tests to make tests pass after recent Ceres update
+
+ Just a precision issue, difference is around 1e-7. Should be fine to
+ simply update expected value.
+
+commit e1ac9f6124110c1a90d8e417bea47acfcbdcca42
+Author: Sergey Sharybin <sergey.vfx@gmail.com>
+Date: Wed May 31 10:54:48 2017 +0200
+
+ Update Ceres to latest release 1.12.0
+
+commit ac1571352b4962f110929b963f8616d7310ceea5
+Author: Sergey Sharybin <sergey.vfx@gmail.com>
+Date: Fri Apr 7 17:10:44 2017 +0200
+
+ Fix crash of keyframe selection on 32bit linux
+
+commit 5f8df3da965686df39a6ae5c9f17482075017bf4
+Author: Sergey Sharybin <sergey.vfx@gmail.com>
+Date: Tue Jan 19 14:00:53 2016 +0500
+
+ Solve some strict warnings in tests
+
+commit 8ea3a5d752a9ce3337ab7643897472a4d33747f1
+Author: Brecht Van Lommel <brechtvanlommel@gmail.com>
+Date: Sat Feb 18 23:52:31 2017 +0100
+
+ Fix a few compiler warnings with macOS / clang.
+
+commit ffbe81461770e70736e80b8cab8e6eb1f8b27160
+Author: Mike Erwin <significant.bit@gmail.com>
+Date: Wed May 31 10:43:08 2017 +0200
+
+ Fix comparison of identicals
+
+ Some of these check that dimensions match before running code that
+ assumes they do match.
+
+ Found with PVS-Studio T48917.
+
+commit 206c01999cde16c1c6c43a8e13ffa86020821d98
+Author: Sergey Sharybin <sergey.vfx@gmail.com>
+Date: Wed May 31 10:39:16 2017 +0200
+
+ Add basic track masking API in place
+
+ This brings back ability to mask non-interesting parts of
+ specific track (the feature got lost with new auto-track API).
+
+ Added it back by extending frame accessor class. This isn't really
+ a frame thing, but we don't have other type of accessor here.
+
+ Surely, we can use old-style API here and pass mask via region
+ tracker options for this particular case, but then it becomes much
+ less obvious how real auto-tracker will access this mask with old
+ style API.
+
+ So seems we do need an accessor for such data, just matter of
+ finding better place than frame accessor.
+
+commit faa069cb826892780356477cc10602390fecf06b
+Author: Sergey Sharybin <sergey.vfx@gmail.com>
+Date: Wed May 31 10:36:26 2017 +0200
+
+ Tests: Tweak epsilon to avoid what looks a false-positive failure
+
+commit 7c84e45c1d330871477ba3516f57178e5b9d101f
+Author: Sergey Sharybin <sergey.vfx@gmail.com>
+Date: Wed May 31 10:15:43 2017 +0200
+
+ CMake: Fix mistake in closing branch
+
+commit cb769a0d319a8c95948153d78a4c3378a0142ece
+Author: Sergey Sharybin <sergey.vfx@gmail.com>
+Date: Thu Jul 21 12:52:33 2016 +0200
+
+ Set of fixes for MSVC215
+
+ - Move GLOG/GFLAGS defines to a more global scope,
+ this way ANY of our own libraries will use proper
+ declspec.
+
+ - Compile png/zlib/openexif on Windows as well since
+ those are required for a correct linking.
+
+commit bb95c8654fd2cea72d66ed04cd825cc3712ea804
+Author: Sergey Sharybin <sergey.vfx@gmail.com>
+Date: Wed Jul 20 18:14:46 2016 +0200
+
+ Disable unexisting Ceres option
+
+ Explicit Schur complement requires having
+ newer Ceres than we currently have bundled.
+
+commit a2e12c959ef32cc9382244d1581992c2f7aa9c09
+Author: Sergey Sharybin <sergey.vfx@gmail.com>
+Date: Wed Jul 20 18:04:57 2016 +0200
+
+ Various fixes for MSVC
+
+ - Update Eigen to 3.2.7 since this brings crucial
+ fixes for MSVC 2015.
+
+ - Switch to STATIC build by default.
+
+ There are issues building current sources as dynamic
+ libraries with MSVC2015 and additionally building
+ dynamic Ceres is not recommended anyway, so let's
+ not do this for the time being.
+
+ If anyone finds a way to make this all working --
+ it'llsurely be a welcome addition.
+
commit 7a676106720fb126a27ff010abdd8bb65d7e0d9a
Author: Sergey Sharybin <sergey.vfx@gmail.com>
Date: Mon Jan 4 18:30:12 2016 +0500
@@ -365,239 +518,3 @@ Date: Thu May 8 15:50:26 2014 +0200
Reviewed By: sergey
Differential Revision: https://developer.blender.org/D516
-
-commit 4405dff60ea08d454b64da1a7c0595d9328cf8a3
-Author: Keir Mierle <mierle@gmail.com>
-Date: Thu May 8 15:38:14 2014 +0200
-
- Add public SetMarkers to AutoTrack
-
- Reviewers: sergey
-
- Reviewed By: sergey
-
- Differential Revision: https://developer.blender.org/D515
-
-commit c90837f6db276a3b1f610eaad509155f6a43b24f
-Author: Keir Mierle <mierle@gmail.com>
-Date: Thu May 8 15:17:48 2014 +0200
-
- Make autotrack skeleton compile
-
- Reviewers: sergey
-
- Reviewed By: sergey
-
- Differential Revision: https://developer.blender.org/D514
-
-commit be01baa2e82e36f63e548f073157e68d2ff870c0
-Author: Keir Mierle <mierle@gmail.com>
-Date: Wed May 7 18:48:55 2014 +0200
-
- Add preliminary TrackMarkerToFrame in autotrack
-
- Reviewers: sergey
-
- Reviewed By: sergey
-
- Differential Revision: https://developer.blender.org/D509
-
-commit 0cab028d591b3d08672ca86eb6c6e4ac1aacf1d0
-Author: Sergey Sharybin <sergey.vfx@gmail.com>
-Date: Wed May 7 17:59:11 2014 +0200
-
- Remove assert from ArrayND Resize
-
- That assert broke initialization of arrays which doesn't
- own the data since constructor uses Resize to set shape
- and strides.
-
- Strides are still to be fixed, but that's for later.
-
-commit 64f9c118029a9351e9023e96527c120e1d724d5b
-Author: Sergey Sharybin <sergey.vfx@gmail.com>
-Date: Wed May 7 17:42:21 2014 +0200
-
- Fix ArrayND freeing the data it doesn't own
-
- Can't really guarantee it works fully correct now,
- but at least this check is needed anyway and compilation
- works just fine.
-
- Reviewers: keir
-
- Reviewed By: keir
-
- Differential Revision: https://developer.blender.org/D508
-
-commit 0618f1c8e88dfc738cdde55784da80b889905e7c
-Author: Keir Mierle <mierle@gmail.com>
-Date: Wed May 7 12:03:32 2014 +0200
-
- Minor changes
-
- Reviewers: sergey
-
- Reviewed By: sergey
-
- Differential Revision: https://developer.blender.org/D505
-
-commit 5c34335e1bb90c4ed701ee830c718ed4e20dbffa
-Author: Sergey Sharybin <sergey.vfx@gmail.com>
-Date: Wed May 7 11:12:23 2014 +0200
-
- Fix compilation error in frame accessor
-
- - int64 is not a standard type, we've got int64_t defined in
- std int. We also have an msvc port of this header, so should
- not be an issue.
-
- - Fixed inconsistency in usage of CacheKey and Key, used Key.
-
- - Some functions weren't marked as virtual.
-
- Additional change: added self to authors.
-
- Reviewers: keir
-
- Reviewed By: keir
-
- Differential Revision: https://developer.blender.org/D504
-
-commit 06bc207614e262cd688e2c3ed820ade7c77bdb66
-Author: Keir Mierle <mierle@gmail.com>
-Date: Tue May 6 22:30:59 2014 +0200
-
- Start new Tracks implementation
-
- This adds the new Tracks implementation, as well as a
- trivial test to show it compiles.
-
- Reviewers: sergey
-
- Reviewed By: sergey
-
- Differential Revision: https://developer.blender.org/D502
-
-commit 25ce061e6da69881460ba7718bb0d660a2380a02
-Author: Keir Mierle <mierle@gmail.com>
-Date: Tue May 6 19:10:51 2014 +0200
-
- Add Reconstruction class for new API
-
- This starts the new Reconstruction class (with support for e.g. planes). This
- also starts the new namespace "mv" which will eventually have all the symbols
- we wish to export.
-
- Reviewers: sergey
-
- Reviewed By: sergey
-
- Differential Revision: https://developer.blender.org/D501
-
-commit 0a6af3e29016048978aea607673340500e050339
-Author: Keir Mierle <mierle@gmail.com>
-Date: Tue May 6 17:52:53 2014 +0200
-
- Add a new Tracks implementation
-
- Reviewers: sergey
-
- Reviewed By: sergey
-
- Differential Revision: https://developer.blender.org/D500
-
-commit 887b68d29c2b198f4939f9ab5153881aa2c1806e
-Author: Keir Mierle <mierle@gmail.com>
-Date: Tue May 6 17:01:39 2014 +0200
-
- Initial commit of unfinished AutoTrack API
-
- This starts the creating the new AutoTrack API. The new API will
- make it possible for libmv to do full autotracking, including
- predictive tracking and also support multiple motion models (3D
- planes etc).
-
- The first goal (not in this patch) is to convert Blender to use
- the new API without adding any new functionality.
-
- Note: This does not add any of the API to the build system!
- It likely does not compile.
-
- Reviewers: sergey
-
- Reviewed By: sergey
-
- Differential Revision: https://developer.blender.org/D499
-
-commit 08cc227d431d257d27f300fbb8e6991e663302da
-Author: Sergey Sharybin <sergey.vfx@gmail.com>
-Date: Tue May 6 13:09:22 2014 +0200
-
- Fix homography test failure
-
- It was caused by assuming that reconstructed homography matrix
- should look exactly the same as the matrix used to generate a
- test case.
-
- It's not actually valid assumption because different-looking
- matrices could correspond to the same exact transform.
-
- In this change we make it so actual "re-projected" vectors
- are being checked, not the values in matrix. This makes it
- more predictable verification.
-
- Reviewers: keir
-
- Reviewed By: keir
-
- Differential Revision: https://developer.blender.org/D488
-
-commit 0b7d83dc9627447dc7df64d7e3a468aefe9ddc13
-Author: Sergey Sharybin <sergey.vfx@gmail.com>
-Date: Wed Apr 23 19:14:55 2014 +0600
-
- Fix compilation on OSX after previous commit
-
- EXPECT_EQ wasn't defined in the scope.
-
-commit d14049e00dabf8fdf49056779f0a3718fbb39e8f
-Author: Sergey Sharybin <sergey.vfx@gmail.com>
-Date: Wed Apr 23 15:08:16 2014 +0600
-
- Move aligned malloc implementation into own file
-
- It was rather stupid having it in brute region tracker,
- now it is in own file in base library (which was also
- added in this commit, before this it consist of header
- files only).
-
- Reviewers: keir
-
- Reviewed By: keir
-
- Differential Revision: https://developer.blender.org/D479
-
-commit 0ddf3851bfcb8de43660b119a25a77a25674200d
-Author: Sergey Sharybin <sergey.vfx@gmail.com>
-Date: Mon Apr 21 14:14:03 2014 +0600
-
- Optimization of PearsonProductMomentCorrelation
-
- Pass the arrays by reference rather than by value,
- should give some percent of speedup.
-
- Also don't pass the dimensions to the function but
- get them from the images themselves.
-
- Hopefully this will give some %% of tracker speedup.
-
-commit f68fdbe5896a6c5bd8b500caeec61b876c5e44c6
-Author: Sergey Sharybin <sergey.vfx@gmail.com>
-Date: Mon Apr 21 14:10:43 2014 +0600
-
- Fix wrong assert in ResizeImage()
-
- The assert didn't make any sense because ComputeBoundingBox()
- is intended to return bounding box in the following way:
- (xmin, xmax, ymin, ymax).
diff --git a/intern/libmv/bundle.sh b/intern/libmv/bundle.sh
index b1a4be84e53..d155d050782 100755
--- a/intern/libmv/bundle.sh
+++ b/intern/libmv/bundle.sh
@@ -120,9 +120,10 @@ if(WITH_LIBMV)
add_definitions(\${GFLAGS_DEFINES})
add_definitions(\${GLOG_DEFINES})
add_definitions(\${CERES_DEFINES})
+ add_definitions(-DLIBMV_GFLAGS_NAMESPACE=\${GFLAGS_NAMESPACE})
list(APPEND INC
- ../../extern/gflags/src
+ \${GFLAGS_INCLUDE_DIRS}
../../extern/glog/src
../../extern/ceres/include
../../extern/ceres/config
diff --git a/intern/libmv/intern/frame_accessor.cc b/intern/libmv/intern/frame_accessor.cc
index 5d274d7ccca..a741eb88fc7 100644
--- a/intern/libmv/intern/frame_accessor.cc
+++ b/intern/libmv/intern/frame_accessor.cc
@@ -40,10 +40,14 @@ using mv::Region;
struct LibmvFrameAccessor : public FrameAccessor {
LibmvFrameAccessor(libmv_FrameAccessorUserData* user_data,
libmv_GetImageCallback get_image_callback,
- libmv_ReleaseImageCallback release_image_callback)
+ libmv_ReleaseImageCallback release_image_callback,
+ libmv_GetMaskForTrackCallback get_mask_for_track_callback,
+ libmv_ReleaseMaskCallback release_mask_callback)
: user_data_(user_data),
get_image_callback_(get_image_callback),
- release_image_callback_(release_image_callback) { }
+ release_image_callback_(release_image_callback),
+ get_mask_for_track_callback_(get_mask_for_track_callback),
+ release_mask_callback_(release_mask_callback) { }
virtual ~LibmvFrameAccessor() {
}
@@ -109,6 +113,46 @@ struct LibmvFrameAccessor : public FrameAccessor {
release_image_callback_(cache_key);
}
+ Key GetMaskForTrack(int clip,
+ int frame,
+ int track,
+ const Region* region,
+ FloatImage* destination) {
+ float *float_buffer;
+ int width, height;
+ libmv_Region libmv_region;
+ if (region) {
+ get_libmv_region(*region, &libmv_region);
+ }
+ Key cache_key = get_mask_for_track_callback_(
+ user_data_,
+ clip,
+ frame,
+ track,
+ region != NULL ? &libmv_region : NULL,
+ &float_buffer,
+ &width,
+ &height);
+
+ if (cache_key == NULL) {
+ // No mask for the given track.
+ return NULL;
+ }
+
+ // TODO(sergey): Dumb code for until we can set data directly.
+ FloatImage temp_image(float_buffer,
+ height,
+ width,
+ 1);
+ destination->CopyFrom(temp_image);
+
+ return cache_key;
+ }
+
+ void ReleaseMask(Key key) {
+ release_mask_callback_(key);
+ }
+
bool GetClipDimensions(int /*clip*/, int * /*width*/, int * /*height*/) {
return false;
}
@@ -124,6 +168,8 @@ struct LibmvFrameAccessor : public FrameAccessor {
libmv_FrameAccessorUserData* user_data_;
libmv_GetImageCallback get_image_callback_;
libmv_ReleaseImageCallback release_image_callback_;
+ libmv_GetMaskForTrackCallback get_mask_for_track_callback_;
+ libmv_ReleaseMaskCallback release_mask_callback_;
};
} // namespace
@@ -131,11 +177,15 @@ struct LibmvFrameAccessor : public FrameAccessor {
libmv_FrameAccessor* libmv_FrameAccessorNew(
libmv_FrameAccessorUserData* user_data,
libmv_GetImageCallback get_image_callback,
- libmv_ReleaseImageCallback release_image_callback) {
+ libmv_ReleaseImageCallback release_image_callback,
+ libmv_GetMaskForTrackCallback get_mask_for_track_callback,
+ libmv_ReleaseMaskCallback release_mask_callback) {
return (libmv_FrameAccessor*) LIBMV_OBJECT_NEW(LibmvFrameAccessor,
user_data,
get_image_callback,
- release_image_callback);
+ release_image_callback,
+ get_mask_for_track_callback,
+ release_mask_callback);
}
void libmv_FrameAccessorDestroy(libmv_FrameAccessor* frame_accessor) {
diff --git a/intern/libmv/intern/frame_accessor.h b/intern/libmv/intern/frame_accessor.h
index 3e813fe7581..c041d67f56f 100644
--- a/intern/libmv/intern/frame_accessor.h
+++ b/intern/libmv/intern/frame_accessor.h
@@ -61,10 +61,23 @@ typedef libmv_CacheKey (*libmv_GetImageCallback) (
typedef void (*libmv_ReleaseImageCallback) (libmv_CacheKey cache_key);
+typedef libmv_CacheKey (*libmv_GetMaskForTrackCallback) (
+ libmv_FrameAccessorUserData* user_data,
+ int clip,
+ int frame,
+ int track,
+ const libmv_Region* region,
+ float** destination,
+ int* width,
+ int* height);
+typedef void (*libmv_ReleaseMaskCallback) (libmv_CacheKey cache_key);
+
libmv_FrameAccessor* libmv_FrameAccessorNew(
libmv_FrameAccessorUserData* user_data,
libmv_GetImageCallback get_image_callback,
- libmv_ReleaseImageCallback release_image_callback);
+ libmv_ReleaseImageCallback release_image_callback,
+ libmv_GetMaskForTrackCallback get_mask_for_track_callback,
+ libmv_ReleaseMaskCallback release_mask_callback);
void libmv_FrameAccessorDestroy(libmv_FrameAccessor* frame_accessor);
int64_t libmv_frameAccessorgetTransformKey(const libmv_FrameTransform *transform);
diff --git a/intern/libmv/intern/logging.cc b/intern/libmv/intern/logging.cc
index 77b56ef4df3..863832cb72b 100644
--- a/intern/libmv/intern/logging.cc
+++ b/intern/libmv/intern/logging.cc
@@ -29,27 +29,29 @@
#include "libmv/logging/logging.h"
void libmv_initLogging(const char* argv0) {
- // Make it so FATAL messages are always print into console.
+ using LIBMV_GFLAGS_NAMESPACE::SetCommandLineOption;
+ // Make it so ERROR messages are always print into console.
char severity_fatal[32];
snprintf(severity_fatal, sizeof(severity_fatal), "%d",
- google::GLOG_FATAL);
-
+ google::GLOG_ERROR);
google::InitGoogleLogging(argv0);
- gflags::SetCommandLineOption("logtostderr", "1");
- gflags::SetCommandLineOption("v", "0");
- gflags::SetCommandLineOption("stderrthreshold", severity_fatal);
- gflags::SetCommandLineOption("minloglevel", severity_fatal);
+ SetCommandLineOption("logtostderr", "1");
+ SetCommandLineOption("v", "0");
+ SetCommandLineOption("stderrthreshold", severity_fatal);
+ SetCommandLineOption("minloglevel", severity_fatal);
}
void libmv_startDebugLogging(void) {
- gflags::SetCommandLineOption("logtostderr", "1");
- gflags::SetCommandLineOption("v", "2");
- gflags::SetCommandLineOption("stderrthreshold", "1");
- gflags::SetCommandLineOption("minloglevel", "0");
+ using LIBMV_GFLAGS_NAMESPACE::SetCommandLineOption;
+ SetCommandLineOption("logtostderr", "1");
+ SetCommandLineOption("v", "2");
+ SetCommandLineOption("stderrthreshold", "1");
+ SetCommandLineOption("minloglevel", "0");
}
void libmv_setLoggingVerbosity(int verbosity) {
+ using LIBMV_GFLAGS_NAMESPACE::SetCommandLineOption;
char val[10];
snprintf(val, sizeof(val), "%d", verbosity);
- gflags::SetCommandLineOption("v", val);
+ SetCommandLineOption("v", val);
}
diff --git a/intern/libmv/intern/stub.cc b/intern/libmv/intern/stub.cc
index 47e1915e072..8603cc03153 100644
--- a/intern/libmv/intern/stub.cc
+++ b/intern/libmv/intern/stub.cc
@@ -375,7 +375,9 @@ int libmv_autoTrackGetMarker(libmv_AutoTrack* /*libmv_autotrack*/,
libmv_FrameAccessor* libmv_FrameAccessorNew(
libmv_FrameAccessorUserData* /*user_data**/,
libmv_GetImageCallback /*get_image_callback*/,
- libmv_ReleaseImageCallback /*release_image_callback*/)
+ libmv_ReleaseImageCallback /*release_image_callback*/,
+ libmv_GetMaskForTrackCallback /*get_mask_for_track_callback*/,
+ libmv_ReleaseMaskCallback /*release_mask_callback*/)
{
return NULL;
}
diff --git a/intern/libmv/libmv/autotrack/autotrack.cc b/intern/libmv/libmv/autotrack/autotrack.cc
index 4c7bdf1fde8..00366e0f661 100644
--- a/intern/libmv/libmv/autotrack/autotrack.cc
+++ b/intern/libmv/libmv/autotrack/autotrack.cc
@@ -111,6 +111,17 @@ FrameAccessor::Key GetImageForMarker(const Marker& marker,
image);
}
+FrameAccessor::Key GetMaskForMarker(const Marker& marker,
+ FrameAccessor* frame_accessor,
+ FloatImage* mask) {
+ Region region = marker.search_region.Rounded();
+ return frame_accessor->GetMaskForTrack(marker.clip,
+ marker.frame,
+ marker.track,
+ &region,
+ mask);
+}
+
} // namespace
bool AutoTrack::TrackMarker(Marker* tracked_marker,
@@ -149,6 +160,11 @@ bool AutoTrack::TrackMarker(Marker* tracked_marker,
return false;
}
+ FloatImage reference_mask;
+ FrameAccessor::Key reference_mask_key = GetMaskForMarker(reference_marker,
+ frame_accessor_,
+ &reference_mask);
+
FloatImage tracked_image;
FrameAccessor::Key tracked_key = GetImageForMarker(*tracked_marker,
frame_accessor_,
@@ -167,6 +183,10 @@ bool AutoTrack::TrackMarker(Marker* tracked_marker,
if (track_options) {
local_track_region_options = *track_options;
}
+ if (reference_mask_key != NULL) {
+ LG << "Using mask for reference marker: " << reference_marker;
+ local_track_region_options.image1_mask = &reference_mask;
+ }
local_track_region_options.num_extra_points = 1; // For center point.
local_track_region_options.attempt_refine_before_brute = predicted_position;
TrackRegion(reference_image,
@@ -191,9 +211,10 @@ bool AutoTrack::TrackMarker(Marker* tracked_marker,
tracked_marker->reference_clip = reference_marker.clip;
tracked_marker->reference_frame = reference_marker.frame;
- // Release the images from the accessor cache.
+ // Release the images and masks from the accessor cache.
frame_accessor_->ReleaseImage(reference_key);
frame_accessor_->ReleaseImage(tracked_key);
+ frame_accessor_->ReleaseMask(reference_mask_key);
// TODO(keir): Possibly the return here should get removed since the results
// are part of TrackResult. However, eventually the autotrack stuff will have
diff --git a/intern/libmv/libmv/autotrack/frame_accessor.h b/intern/libmv/libmv/autotrack/frame_accessor.h
index 8de5d865cd7..32f6349963c 100644
--- a/intern/libmv/libmv/autotrack/frame_accessor.h
+++ b/intern/libmv/libmv/autotrack/frame_accessor.h
@@ -76,6 +76,25 @@ struct FrameAccessor {
// free the image immediately; others may hold onto the image.
virtual void ReleaseImage(Key) = 0;
+ // Get mask image for the given track.
+ //
+ // Implementation of this method should sample mask associated with the track
+ // within given region to the given destination.
+ //
+ // Result is supposed to be a single channel image.
+ //
+ // If region is NULL, it is assumed to be full-frame.
+ virtual Key GetMaskForTrack(int clip,
+ int frame,
+ int track,
+ const Region* region,
+ FloatImage* destination) = 0;
+
+ // Release a specified mask.
+ //
+ // Non-caching implementation may free used memory immediately.
+ virtual void ReleaseMask(Key key) = 0;
+
virtual bool GetClipDimensions(int clip, int* width, int* height) = 0;
virtual int NumClips() = 0;
virtual int NumFrames(int clip) = 0;
diff --git a/intern/libmv/libmv/autotrack/predict_tracks.cc b/intern/libmv/libmv/autotrack/predict_tracks.cc
index adc986a0033..3786c1b9a3b 100644
--- a/intern/libmv/libmv/autotrack/predict_tracks.cc
+++ b/intern/libmv/libmv/autotrack/predict_tracks.cc
@@ -66,6 +66,7 @@ const double velocity_state_transition_data[] = {
0, 0, 0, 0, 0, 1
};
+#if 0
// This 3rd-order system also models acceleration. This makes for "jerky"
// predictions, but that tend to be more accurate.
const double acceleration_state_transition_data[] = {
@@ -87,6 +88,7 @@ const double angular_state_transition_data[] = {
0, 0, 0, 0, 1, 0, // Velocity y
0, 0, 0, 0, 0, 1 // Ignored
};
+#endif
const double* state_transition_data = velocity_state_transition_data;
diff --git a/intern/locale/CMakeLists.txt b/intern/locale/CMakeLists.txt
index 6896702fcbf..cbc75d1ab1f 100644
--- a/intern/locale/CMakeLists.txt
+++ b/intern/locale/CMakeLists.txt
@@ -60,22 +60,3 @@ if(WITH_INTERNATIONAL)
endif()
blender_add_lib(bf_intern_locale "${SRC}" "${INC}" "${INC_SYS}")
-
-# -----------------------------------------------------------------------------
-# Build msgfmt executable
-
-if(CMAKE_COMPILER_IS_GNUCC)
- # workaroud ld.gold linker bug
- string(REPLACE "-fuse-ld=gold" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
-endif()
-
-set(MSFFMT_SRC
- msgfmt.cc
-)
-add_executable(msgfmt ${MSFFMT_SRC})
-
-if(CMAKE_C_COMPILER_ID MATCHES "Clang" AND (NOT (CMAKE_C_COMPILER_VERSION VERSION_LESS 3.4)))
- # needed for clang 3.4+
- target_link_libraries(msgfmt ${PLATFORM_LINKLIBS})
-endif()
-
diff --git a/intern/locale/msgfmt.cc b/intern/locale/msgfmt.cc
deleted file mode 100644
index 02c58ebc5bc..00000000000
--- a/intern/locale/msgfmt.cc
+++ /dev/null
@@ -1,374 +0,0 @@
-// Written by Sergey Sharybin <sergey.vfx@gmail.com>
-// Added support for contexts
-//
-// Based on Python script msgfmt.py from Python source
-// code tree, which was written by Written by
-// Martin v. Löwis <loewis@informatik.hu-berlin.de>
-//
-// Generate binary message catalog from textual translation description.
-//
-// This program converts a textual Uniforum-style message catalog (.po file) into
-// a binary GNU catalog (.mo file). This is essentially the same function as the
-// GNU msgfmt program, however, it is a simpler implementation.
-//
-// Usage: msgfmt input.po output.po
-
-#include <algorithm>
-#include <cctype>
-#include <fstream>
-#include <functional>
-#include <iostream>
-#include <map>
-#include <stdlib.h>
-#include <string>
-#include <vector>
-
-namespace {
-
-std::map<std::string, std::string> MESSAGES;
-
-bool starts_with(const std::string &str,
- const std::string &prefix) {
- const size_t prefix_length = prefix.length();
- if (prefix_length == 0) {
- return true;
- }
- // TODO(sergey): Could be optimized if we calculate str.length()
- // to maximum of prefix_length characters.
- if (prefix_length > str.length()) {
- return false;
- } else {
- return str.compare(0, prefix_length, prefix) == 0;
- }
-}
-
-std::string trim(const std::string &str) {
- std::string result = str;
- result.erase(0, result.find_first_not_of(" \t\r\n"));
- result.erase(result.find_last_not_of(" \t\r\n") + 1);
- return result;
-}
-
-std::string unescape(const std::string &str) {
- std::string result;
- const size_t str_length = str.length();
- size_t i = 0;
- while (i < str_length) {
- char current_char = str[i++];
- if (current_char == '\\' && i < str_length - 1) {
- char next_char = str[i++];
- if (next_char == '\\') {
- current_char = '\\';
- } else if (next_char == 'n') {
- current_char = '\n';
- } else if (next_char == 't') {
- current_char = '\t';
- } else {
- current_char = next_char;
- }
- }
- result += current_char;
- }
-
- const size_t result_length = result.length();
- if (result[0] == '"' && result[result_length - 1] == '"') {
- result = result.substr(1, result_length - 2);
- }
-
- return result;
-}
-
-// Add a non-fuzzy translation to the dictionary.
-void add(const std::string &msgctxt,
- const std::string &msgid,
- const std::string &msgstr,
- bool fuzzy) {
- if (fuzzy == false && msgstr.empty() == false) {
- if (msgctxt.empty()) {
- MESSAGES[msgid] = msgstr;
- } else {
- MESSAGES[msgctxt + (char)0x04 + msgid] = msgstr;
- }
- }
-}
-
-template<typename TKey, typename TValue>
-void get_keys(std::map<TKey, TValue> map,
- std::vector<TKey> *keys) {
- keys->reserve(map.size());
- for (typename std::map<TKey, TValue>::iterator it = map.begin();
- it != map.end();
- it++) {
- keys->push_back(it->first);
- }
-}
-
-std::string intToBytes(int value) {
- std::string result;
- for (unsigned int i = 0; i < sizeof(value); i++) {
- result += (unsigned char) ((value >> (i * 8)) & 0xff);
- }
- return result;
-}
-
-typedef enum {
- SECTION_NONE = 0,
- SECTION_CTX = 1,
- SECTION_ID = 2,
- SECTION_STR = 3
-} eSectionType;
-
-struct Offset {
- unsigned int o1, l1, o2, l2;
-};
-
-// Return the generated output.
-std::string generate(void) {
- // The keys are sorted in the .mo file
- std::vector<std::string> keys;
-
- // Get list of sorted keys.
- get_keys(MESSAGES, &keys);
- std::sort(keys.begin(), keys.end());
-
- std::vector<Offset> offsets;
- offsets.reserve(keys.size());
- std::string ids = "", strs = "";
- for (std::vector<std::string>::iterator it = keys.begin();
- it != keys.end();
- it++) {
- std::string &id = *it;
- // For each string, we need size and file offset. Each string is NUL
- // terminated; the NUL does not count into the size.
- Offset offset = {(unsigned int) ids.size(),
- (unsigned int) id.size(),
- (unsigned int) strs.size(),
- (unsigned int) MESSAGES[id].size()};
- offsets.push_back(offset);
- ids += id + '\0';
- strs += MESSAGES[id] + '\0';
- }
-
- // The header is 7 32-bit unsigned integers. We don't use hash tables, so
- // the keys start right after the index tables.
- // translated string.
- int keystart = 7 * 4 + 16 * keys.size();
- // and the values start after the keys
- int valuestart = keystart + ids.size();
- std::vector<int> koffsets;
- std::vector<int> voffsets;
- koffsets.reserve(offsets.size() * 2);
- voffsets.reserve(offsets.size() * 2);
- // The string table first has the list of keys, then the list of values.
- // Each entry has first the size of the string, then the file offset.
- for (std::vector<Offset>::iterator it = offsets.begin();
- it != offsets.end();
- it++) {
- Offset &offset = *it;
- koffsets.push_back(offset.l1);
- koffsets.push_back(offset.o1 + keystart);
- voffsets.push_back(offset.l2);
- voffsets.push_back(offset.o2 + valuestart);
- }
-
- std::vector<int> all_offsets;
- all_offsets.reserve(koffsets.size() + voffsets.size());
- all_offsets.insert(all_offsets.end(), koffsets.begin(), koffsets.end());
- all_offsets.insert(all_offsets.end(), voffsets.begin(), voffsets.end());
-
- std::string output = "";
- output += intToBytes(0x950412de); // Magic
- output += intToBytes(0x0); // Version
- output += intToBytes(keys.size()); // # of entries
- output += intToBytes(7 * 4); // start of key index
- output += intToBytes(7 * 4 + keys.size() * 8); // start of value index
- output += intToBytes(0); // Size of hash table
- output += intToBytes(0); // Offset of hash table
-
- for (std::vector<int>::iterator it = all_offsets.begin();
- it != all_offsets.end();
- it++) {
- int offset = *it;
- output += intToBytes(offset);
- }
-
- output += ids;
- output += strs;
-
- return output;
-}
-
-void make(const char *input_file_name,
- const char *output_file_name) {
- std::map<std::string, std::string> messages;
-
- // Start off assuming Latin-1, so everything decodes without failure,
- // until we know the exact encoding.
- // TODO(sergey): Support encoding.
- // const char *encoding = "latin-1";
-
- eSectionType section = SECTION_NONE;
- bool fuzzy = false;
- bool is_plural = false;
- std::string msgctxt, msgid, msgstr;
-
- std::ifstream input_file_stream(input_file_name);
-
- // Parse the catalog.
- int lno = 0;
- for (std::string l; getline(input_file_stream, l); ) {
- lno++;
- // If we get a comment line after a msgstr, this is a new entry.
- if (l[0] == '#' && section == SECTION_STR) {
- add(msgctxt, msgid, msgstr, fuzzy);
- section = SECTION_NONE;
- msgctxt = "";
- fuzzy = false;
- }
- // Record a fuzzy mark.
- if (starts_with(l, "#,") && l.find("fuzzy") != std::string::npos) {
- fuzzy = true;
- }
- // Skip comments
- if (l[0] == '#') {
- continue;
- }
- // Now we are in a msgid section, output previous section.
- if (starts_with(l, "msgctxt")) {
- if (section == SECTION_STR) {
- add(msgctxt, msgid, msgstr, fuzzy);
- }
- section = SECTION_CTX;
- l = l.substr(7, l.size() - 7);
- msgctxt = msgid = msgstr = "";
- }
- else if (starts_with(l, "msgid") && !starts_with(l, "msgid_plural")) {
- if (section == SECTION_STR) {
- add(msgctxt, msgid, msgstr, fuzzy);
- msgctxt = "";
- if (msgid == "") {
-#if 0
- // See whether there is an encoding declaration.
- p = HeaderParser();
- charset = p.parsestr(msgstr.decode(encoding)).get_content_charset();
- if (charset) {
- encoding = charset;
- }
-#else
- // Not ported to C++ yet.
- std::cerr << "Encoding declarations are not supported yet.\n"
- << std::endl;
- abort();
-#endif
- }
- }
- section = SECTION_ID;
- l = l.substr(5, l.size() - 5);
- msgid = msgstr = "";
- is_plural = false;
- } else if (starts_with(l, "msgid_plural")) {
- // This is a message with plural forms.
- if (section != SECTION_ID) {
- std::cerr << "msgid_plural not preceeded by msgid on"
- << input_file_name << ":"
- << lno
- << std::endl;
- abort();
- }
- l = l.substr(12, l.size() - 12);
- msgid += '\0'; // separator of singular and plural
- is_plural = true;
- } else if (starts_with(l, "msgstr")) {
- // Now we are in a msgstr section
- section = SECTION_STR;
- if (starts_with(l, "msgstr[")) {
- if (is_plural == false) {
- std::cerr << "plural without msgid_plural on "
- << input_file_name << ":"
- << lno
- << std::endl;
- abort();
- }
- int bracket_position = l.find(']');
- if (bracket_position == std::string::npos) {
- std::cerr << "Syntax error on "
- << input_file_name << ":"
- << lno
- << std::endl;
- abort();
- }
- l = l.substr(bracket_position, l.size() - bracket_position);
- if (msgstr != "") {
- msgstr += '\0'; // Separator of the various plural forms;
- }
- } else {
- if (is_plural) {
- std::cerr << "indexed msgstr required for plural on "
- << input_file_name << ":"
- << lno
- << std::endl;
- abort();
- }
- l = l.substr(6, l.size() - 6);
- }
- }
- // Skip empty lines.
- l = trim(l);
- if (l.empty()) {
- if (section == SECTION_STR) {
- add(msgctxt, msgid, msgstr, fuzzy);
- msgctxt = msgid = msgstr = "";
- section = SECTION_NONE;
- fuzzy = false;
- }
- continue;
- }
- l = unescape(l);
- if (section == SECTION_CTX) {
- // TODO(sergey): Support encoding.
- // msgid += l.encode(encoding);
- msgctxt += l;
- }
- else if (section == SECTION_ID) {
- // TODO(sergey): Support encoding.
- // msgid += l.encode(encoding);
- msgid += l;
- } else if (section == SECTION_STR) {
- // TODO(sergey): Support encoding.
- // msgstr += l.encode(encoding)
- msgstr += l;
- } else {
- std::cerr << "Syntax error on "
- << input_file_name << ":"
- << lno
- << std::endl;
- abort();
- }
- // Add last entry
- if (section == SECTION_STR) {
- add(msgctxt, msgid, msgstr, fuzzy);
- }
- }
-
- // Compute output
- std::string output = generate();
-
- std::ofstream output_file_stream(output_file_name,
- std::ios::out | std::ios::binary);
- output_file_stream << output;
-}
-
-} // namespace
-
-int main(int argc, char **argv) {
- if (argc != 3) {
- printf("Usage: %s <input.po> <output.mo>\n", argv[0]);
- return EXIT_FAILURE;
- }
- const char *input_file = argv[1];
- const char *output_file = argv[2];
-
- make(input_file, output_file);
-
- return EXIT_SUCCESS;
-}
diff --git a/intern/memutil/MEM_CacheLimiterC-Api.h b/intern/memutil/MEM_CacheLimiterC-Api.h
index 0fe5469a4d4..b5680890eb8 100644
--- a/intern/memutil/MEM_CacheLimiterC-Api.h
+++ b/intern/memutil/MEM_CacheLimiterC-Api.h
@@ -61,8 +61,8 @@ bool MEM_CacheLimiter_is_disabled(void);
* Create new MEM_CacheLimiter object
* managed objects are destructed with the data_destructor
*
- * @param data_destructor
- * @return A new MEM_CacheLimter object
+ * \param data_destructor
+ * \return A new MEM_CacheLimter object
*/
MEM_CacheLimiterC *new_MEM_CacheLimiter(MEM_CacheLimiter_Destruct_Func data_destructor,
@@ -73,7 +73,7 @@ MEM_CacheLimiterC *new_MEM_CacheLimiter(MEM_CacheLimiter_Destruct_Func data_dest
*
* Frees the memory of the CacheLimiter but does not touch managed objects!
*
- * @param This "This" pointer
+ * \param This "This" pointer
*/
void delete_MEM_CacheLimiter(MEM_CacheLimiterC *This);
@@ -81,8 +81,8 @@ void delete_MEM_CacheLimiter(MEM_CacheLimiterC *This);
/**
* Manage object
*
- * @param This "This" pointer, data data object to manage
- * @return CacheLimiterHandle to ref, unref, touch the managed object
+ * \param This "This" pointer, data data object to manage
+ * \return CacheLimiterHandle to ref, unref, touch the managed object
*/
MEM_CacheLimiterHandleC *MEM_CacheLimiter_insert(MEM_CacheLimiterC *This, void *data);
@@ -90,7 +90,7 @@ MEM_CacheLimiterHandleC *MEM_CacheLimiter_insert(MEM_CacheLimiterC *This, void *
/**
* Free objects until memory constraints are satisfied
*
- * @param This "This" pointer
+ * \param This "This" pointer
*/
void MEM_CacheLimiter_enforce_limits(MEM_CacheLimiterC *This);
@@ -99,7 +99,7 @@ void MEM_CacheLimiter_enforce_limits(MEM_CacheLimiterC *This);
* Unmanage object previously inserted object.
* Does _not_ delete managed object!
*
- * @param This "This" pointer, handle of object
+ * \param handle of object
*/
void MEM_CacheLimiter_unmanage(MEM_CacheLimiterHandleC *handle);
@@ -108,7 +108,7 @@ void MEM_CacheLimiter_unmanage(MEM_CacheLimiterHandleC *handle);
/**
* Raise priority of object (put it at the tail of the deletion chain)
*
- * @param handle of object
+ * \param handle of object
*/
void MEM_CacheLimiter_touch(MEM_CacheLimiterHandleC *handle);
@@ -117,7 +117,7 @@ void MEM_CacheLimiter_touch(MEM_CacheLimiterHandleC *handle);
* Increment reference counter. Objects with reference counter != 0 are _not_
* deleted.
*
- * @param handle of object
+ * \param handle of object
*/
void MEM_CacheLimiter_ref(MEM_CacheLimiterHandleC *handle);
@@ -126,7 +126,7 @@ void MEM_CacheLimiter_ref(MEM_CacheLimiterHandleC *handle);
* Decrement reference counter. Objects with reference counter != 0 are _not_
* deleted.
*
- * @param handle of object
+ * \param handle of object
*/
void MEM_CacheLimiter_unref(MEM_CacheLimiterHandleC *handle);
@@ -134,7 +134,7 @@ void MEM_CacheLimiter_unref(MEM_CacheLimiterHandleC *handle);
/**
* Get reference counter.
*
- * @param handle of object
+ * \param handle of object
*/
int MEM_CacheLimiter_get_refcount(MEM_CacheLimiterHandleC *handle);
@@ -142,7 +142,7 @@ int MEM_CacheLimiter_get_refcount(MEM_CacheLimiterHandleC *handle);
/**
* Get pointer to managed object
*
- * @param handle of object
+ * \param handle of object
*/
void *MEM_CacheLimiter_get(MEM_CacheLimiterHandleC *handle);
diff --git a/intern/opencolorio/CMakeLists.txt b/intern/opencolorio/CMakeLists.txt
index 61a8d995f40..75e228933aa 100644
--- a/intern/opencolorio/CMakeLists.txt
+++ b/intern/opencolorio/CMakeLists.txt
@@ -59,7 +59,7 @@ if(WITH_OPENCOLORIO)
ocio_impl_glsl.cc
)
- if(WIN32 AND NOT MINGW)
+ if(WIN32)
list(APPEND INC_SYS
${BOOST_INCLUDE_DIR}
)
diff --git a/intern/opensubdiv/opensubdiv_capi.cc b/intern/opensubdiv/opensubdiv_capi.cc
index 52ce98fe74b..0a55a432cc6 100644
--- a/intern/opensubdiv/opensubdiv_capi.cc
+++ b/intern/opensubdiv/opensubdiv_capi.cc
@@ -33,6 +33,7 @@
#include <stdlib.h>
#include <GL/glew.h>
+#include <opensubdiv/version.h>
#include <opensubdiv/osd/glMesh.h>
/* CPU Backend */
@@ -74,6 +75,16 @@
#include "MEM_guardedalloc.h"
+#include <string>
+#include <vector>
+
+using std::string;
+using std::vector;
+
+#define STRINGIFY_ARG(x) "" #x
+#define STRINGIFY_APPEND(a, b) "" a #b
+#define STRINGIFY(x) STRINGIFY_APPEND("", x)
+
/* **************** Types declaration **************** */
using OpenSubdiv::Osd::GLMeshInterface;
@@ -146,6 +157,38 @@ typedef Mesh<GLVertexBuffer,
namespace {
+#if !defined(OPENSUBDIV_VERSION_NUMBER) && !defined(OPENSUBDIV_VERSION_MINOR)
+void stringSplit(vector<string>* tokens,
+ const string& str,
+ const string& separators,
+ bool skip_empty) {
+ size_t token_start = 0, token_length = 0;
+ for (size_t i = 0; i < str.length(); ++i) {
+ const char ch = str[i];
+ if (separators.find(ch) == string::npos) {
+ /* Append non-separator char to a token. */
+ ++token_length;
+ } else {
+ /* Append current token to the list (if any). */
+ if (token_length > 0 || !skip_empty) {
+ string token = str.substr(token_start, token_length);
+ tokens->push_back(token);
+ }
+ /* Re-set token pointers. */
+ token_start = i + 1;
+ token_length = 0;
+ }
+ }
+ /* Append token which might be at the end of the string. */
+ if ((token_length != 0) ||
+ (!skip_empty && token_start > 0 &&
+ separators.find(str[token_start-1]) != string::npos)) {
+ string token = str.substr(token_start, token_length);
+ tokens->push_back(token);
+ }
+}
+#endif
+
struct FVarVertex {
float u, v;
void Clear() {
@@ -381,3 +424,30 @@ int openSubdiv_supportGPUDisplay(void)
(GLEW_ARB_texture_buffer_object || GLEW_EXT_texture_buffer_object)));
/* also ARB_explicit_attrib_location? */
}
+
+int openSubdiv_getVersionHex(void)
+{
+#if defined(OPENSUBDIV_VERSION_NUMBER)
+ return OPENSUBDIV_VERSION_NUMBER;
+#elif defined(OPENSUBDIV_VERSION_MAJOR)
+ return OPENSUBDIV_VERSION_MAJOR * 10000 +
+ OPENSUBDIV_VERSION_MINOR * 100 +
+ OPENSUBDIV_VERSION_PATCH;
+#elif defined(OPENSUBDIV_VERSION)
+ const char* version = STRINGIFY(OPENSUBDIV_VERSION);
+ if (version[0] == 'v') {
+ version += 1;
+ }
+ int major = 0, minor = 0, patch = 0;
+ vector<string> tokens;
+ stringSplit(&tokens, version, "_", true);
+ if (tokens.size() == 3) {
+ major = atoi(tokens[0].c_str());
+ minor = atoi(tokens[1].c_str());
+ patch = atoi(tokens[2].c_str());
+ }
+ return major * 10000 + minor * 100 + patch;
+#else
+ return 0;
+#endif
+}
diff --git a/intern/opensubdiv/opensubdiv_capi.h b/intern/opensubdiv/opensubdiv_capi.h
index c3a194813e6..281bd3f010d 100644
--- a/intern/opensubdiv/opensubdiv_capi.h
+++ b/intern/opensubdiv/opensubdiv_capi.h
@@ -152,6 +152,8 @@ void openSubdiv_init(bool gpu_legacy_support);
void openSubdiv_cleanup(void);
bool openSubdiv_gpu_legacy_support(void);
+int openSubdiv_getVersionHex(void);
+
#ifdef __cplusplus
}
#endif
diff --git a/intern/string/STR_HashedString.h b/intern/string/STR_HashedString.h
index 8bfbde65895..ce790f398a0 100644
--- a/intern/string/STR_HashedString.h
+++ b/intern/string/STR_HashedString.h
@@ -38,6 +38,14 @@
#include "STR_String.h"
+/* copied from 'BLI_compiler_attrs.h' */
+/* Use to suppress '-Wimplicit-fallthrough' (in place of 'break'). */
+#if defined(__GNUC__) && (__GNUC__ >= 7) /* gcc7.0+ only */
+#define ATTR_FALLTHROUGH __attribute__((fallthrough))
+#else
+#define ATTR_FALLTHROUGH ((void)0)
+#endif
+
// Hash Mix utility function, by Bob Jenkins - Mix 3 32-bit values reversibly
//
@@ -102,16 +110,16 @@ static dword STR_gHash(const void *in, int len, dword init_val)
// Handle the last 11 bytes
c += len;
switch (length) {
- case 11: c += ((dword)p_in[10] << 24);
- case 10: c += ((dword)p_in[9] << 16);
- case 9: c += ((dword)p_in[8] << 8); /* the first byte of c is reserved for the length */
- case 8: b += ((dword)p_in[7] << 24);
- case 7: b += ((dword)p_in[6] << 16);
- case 6: b += ((dword)p_in[5] << 8);
- case 5: b += p_in[4];
- case 4: a += ((dword)p_in[3] << 24);
- case 3: a += ((dword)p_in[2] << 16);
- case 2: a += ((dword)p_in[1] << 8);
+ case 11: c += ((dword)p_in[10] << 24); ATTR_FALLTHROUGH;
+ case 10: c += ((dword)p_in[9] << 16); ATTR_FALLTHROUGH;
+ case 9: c += ((dword)p_in[8] << 8); ATTR_FALLTHROUGH; /* the first byte of c is reserved for the length */
+ case 8: b += ((dword)p_in[7] << 24); ATTR_FALLTHROUGH;
+ case 7: b += ((dword)p_in[6] << 16); ATTR_FALLTHROUGH;
+ case 6: b += ((dword)p_in[5] << 8); ATTR_FALLTHROUGH;
+ case 5: b += p_in[4]; ATTR_FALLTHROUGH;
+ case 4: a += ((dword)p_in[3] << 24); ATTR_FALLTHROUGH;
+ case 3: a += ((dword)p_in[2] << 16); ATTR_FALLTHROUGH;
+ case 2: a += ((dword)p_in[1] << 8); ATTR_FALLTHROUGH;
case 1: a += p_in[0];
}
STR_gHashMix(a, b, c);