Welcome to mirror list, hosted at ThFree Co, Russian Federation.

git.blender.org/blender.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
Diffstat (limited to 'intern/cycles/kernel')
-rw-r--r--intern/cycles/kernel/CMakeLists.txt150
-rw-r--r--intern/cycles/kernel/closure/bsdf.h79
-rw-r--r--intern/cycles/kernel/closure/bsdf_ashikhmin_velvet.h1
-rw-r--r--intern/cycles/kernel/closure/bsdf_diffuse.h1
-rw-r--r--intern/cycles/kernel/closure/bsdf_diffuse_ramp.h1
-rw-r--r--intern/cycles/kernel/closure/bsdf_microfacet.h160
-rw-r--r--intern/cycles/kernel/closure/bsdf_microfacet_multi.h215
-rw-r--r--intern/cycles/kernel/closure/bsdf_microfacet_multi_impl.h164
-rw-r--r--intern/cycles/kernel/closure/bsdf_oren_nayar.h1
-rw-r--r--intern/cycles/kernel/closure/bsdf_phong_ramp.h1
-rw-r--r--intern/cycles/kernel/closure/bsdf_principled_diffuse.h127
-rw-r--r--intern/cycles/kernel/closure/bsdf_principled_sheen.h113
-rw-r--r--intern/cycles/kernel/closure/bsdf_toon.h1
-rw-r--r--intern/cycles/kernel/closure/bsdf_util.h20
-rw-r--r--intern/cycles/kernel/closure/bssrdf.h40
-rw-r--r--intern/cycles/kernel/filter/filter.h52
-rw-r--r--intern/cycles/kernel/filter/filter_defines.h38
-rw-r--r--intern/cycles/kernel/filter/filter_features.h124
-rw-r--r--intern/cycles/kernel/filter/filter_features_sse.h105
-rw-r--r--intern/cycles/kernel/filter/filter_kernel.h50
-rw-r--r--intern/cycles/kernel/filter/filter_nlm_cpu.h186
-rw-r--r--intern/cycles/kernel/filter/filter_nlm_gpu.h144
-rw-r--r--intern/cycles/kernel/filter/filter_prefilter.h211
-rw-r--r--intern/cycles/kernel/filter/filter_reconstruction.h117
-rw-r--r--intern/cycles/kernel/filter/filter_transform.h108
-rw-r--r--intern/cycles/kernel/filter/filter_transform_gpu.h119
-rw-r--r--intern/cycles/kernel/filter/filter_transform_sse.h105
-rw-r--r--intern/cycles/kernel/geom/geom_curve.h2
-rw-r--r--intern/cycles/kernel/geom/geom_triangle.h6
-rw-r--r--intern/cycles/kernel/kernel.h38
-rw-r--r--intern/cycles/kernel/kernel_accumulate.h159
-rw-r--r--intern/cycles/kernel/kernel_compat_cpu.h12
-rw-r--r--intern/cycles/kernel/kernel_compat_cuda.h5
-rw-r--r--intern/cycles/kernel/kernel_compat_opencl.h2
-rw-r--r--intern/cycles/kernel/kernel_globals.h16
-rw-r--r--intern/cycles/kernel/kernel_image_opencl.h63
-rw-r--r--intern/cycles/kernel/kernel_jitter.h21
-rw-r--r--intern/cycles/kernel/kernel_light.h2
-rw-r--r--intern/cycles/kernel/kernel_passes.h217
-rw-r--r--intern/cycles/kernel/kernel_path.h104
-rw-r--r--intern/cycles/kernel/kernel_path_branched.h142
-rw-r--r--intern/cycles/kernel/kernel_path_state.h16
-rw-r--r--intern/cycles/kernel/kernel_path_surface.h25
-rw-r--r--intern/cycles/kernel/kernel_path_volume.h8
-rw-r--r--intern/cycles/kernel/kernel_projection.h3
-rw-r--r--intern/cycles/kernel/kernel_queues.h15
-rw-r--r--intern/cycles/kernel/kernel_random.h238
-rw-r--r--intern/cycles/kernel/kernel_shader.h6
-rw-r--r--intern/cycles/kernel/kernel_shadow.h2
-rw-r--r--intern/cycles/kernel/kernel_subsurface.h58
-rw-r--r--intern/cycles/kernel/kernel_textures.h160
-rw-r--r--intern/cycles/kernel/kernel_types.h206
-rw-r--r--intern/cycles/kernel/kernel_volume.h2
-rw-r--r--intern/cycles/kernel/kernels/cpu/filter.cpp61
-rw-r--r--intern/cycles/kernel/kernels/cpu/filter_avx.cpp39
-rw-r--r--intern/cycles/kernel/kernels/cpu/filter_avx2.cpp40
-rw-r--r--intern/cycles/kernel/kernels/cpu/filter_cpu.h138
-rw-r--r--intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h272
-rw-r--r--intern/cycles/kernel/kernels/cpu/filter_sse2.cpp34
-rw-r--r--intern/cycles/kernel/kernels/cpu/filter_sse3.cpp36
-rw-r--r--intern/cycles/kernel/kernels/cpu/filter_sse41.cpp37
-rw-r--r--intern/cycles/kernel/kernels/cpu/kernel.cpp42
-rw-r--r--intern/cycles/kernel/kernels/cpu/kernel_avx.cpp30
-rw-r--r--intern/cycles/kernel/kernels/cpu/kernel_avx2.cpp32
-rw-r--r--intern/cycles/kernel/kernels/cpu/kernel_cpu.h5
-rw-r--r--intern/cycles/kernel/kernels/cpu/kernel_cpu_image.h82
-rw-r--r--intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h161
-rw-r--r--intern/cycles/kernel/kernels/cpu/kernel_split_avx.cpp29
-rw-r--r--intern/cycles/kernel/kernels/cpu/kernel_split_avx2.cpp32
-rw-r--r--intern/cycles/kernel/kernels/cpu/kernel_split_sse2.cpp20
-rw-r--r--intern/cycles/kernel/kernels/cpu/kernel_split_sse3.cpp24
-rw-r--r--intern/cycles/kernel/kernels/cpu/kernel_split_sse41.cpp26
-rw-r--r--intern/cycles/kernel/kernels/cpu/kernel_sse2.cpp20
-rw-r--r--intern/cycles/kernel/kernels/cpu/kernel_sse3.cpp24
-rw-r--r--intern/cycles/kernel/kernels/cpu/kernel_sse41.cpp26
-rw-r--r--intern/cycles/kernel/kernels/cuda/filter.cu255
-rw-r--r--intern/cycles/kernel/kernels/cuda/kernel_split.cu10
-rw-r--r--intern/cycles/kernel/kernels/opencl/filter.cl280
-rw-r--r--intern/cycles/kernel/kernels/opencl/kernel_buffer_update.cl13
-rw-r--r--intern/cycles/kernel/kernels/opencl/kernel_direct_lighting.cl13
-rw-r--r--intern/cycles/kernel/kernels/opencl/kernel_do_volume.cl10
-rw-r--r--intern/cycles/kernel/kernels/opencl/kernel_enqueue_inactive.cl26
-rw-r--r--intern/cycles/kernel/kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl15
-rw-r--r--intern/cycles/kernel/kernels/opencl/kernel_indirect_background.cl10
-rw-r--r--intern/cycles/kernel/kernels/opencl/kernel_indirect_subsurface.cl10
-rw-r--r--intern/cycles/kernel/kernels/opencl/kernel_lamp_emission.cl10
-rw-r--r--intern/cycles/kernel/kernels/opencl/kernel_next_iteration_setup.cl13
-rw-r--r--intern/cycles/kernel/kernels/opencl/kernel_path_init.cl10
-rw-r--r--intern/cycles/kernel/kernels/opencl/kernel_queue_enqueue.cl13
-rw-r--r--intern/cycles/kernel/kernels/opencl/kernel_scene_intersect.cl10
-rw-r--r--intern/cycles/kernel/kernels/opencl/kernel_shader_eval.cl11
-rw-r--r--intern/cycles/kernel/kernels/opencl/kernel_shader_setup.cl26
-rw-r--r--intern/cycles/kernel/kernels/opencl/kernel_shader_sort.cl27
-rw-r--r--intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked_ao.cl10
-rw-r--r--intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked_dl.cl10
-rw-r--r--intern/cycles/kernel/kernels/opencl/kernel_split.cl3
-rw-r--r--intern/cycles/kernel/kernels/opencl/kernel_split_function.h72
-rw-r--r--intern/cycles/kernel/kernels/opencl/kernel_subsurface_scatter.cl11
-rw-r--r--intern/cycles/kernel/osl/osl_bssrdf.cpp31
-rw-r--r--intern/cycles/kernel/osl/osl_closures.cpp268
-rw-r--r--intern/cycles/kernel/osl/osl_closures.h14
-rw-r--r--intern/cycles/kernel/osl/osl_services.cpp2
-rw-r--r--intern/cycles/kernel/shaders/CMakeLists.txt4
-rw-r--r--intern/cycles/kernel/shaders/node_principled_bsdf.osl120
-rw-r--r--intern/cycles/kernel/shaders/stdosl.h9
-rw-r--r--intern/cycles/kernel/split/kernel_branched.h220
-rw-r--r--intern/cycles/kernel/split/kernel_buffer_update.h16
-rw-r--r--intern/cycles/kernel/split/kernel_data_init.h25
-rw-r--r--intern/cycles/kernel/split/kernel_direct_lighting.h66
-rw-r--r--intern/cycles/kernel/split/kernel_do_volume.h190
-rw-r--r--intern/cycles/kernel/split/kernel_enqueue_inactive.h46
-rw-r--r--intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h123
-rw-r--r--intern/cycles/kernel/split/kernel_indirect_background.h8
-rw-r--r--intern/cycles/kernel/split/kernel_indirect_subsurface.h37
-rw-r--r--intern/cycles/kernel/split/kernel_next_iteration_setup.h222
-rw-r--r--intern/cycles/kernel/split/kernel_queue_enqueue.h3
-rw-r--r--intern/cycles/kernel/split/kernel_scene_intersect.h16
-rw-r--r--intern/cycles/kernel/split/kernel_shader_eval.h69
-rw-r--r--intern/cycles/kernel/split/kernel_shader_setup.h70
-rw-r--r--intern/cycles/kernel/split/kernel_shader_sort.h97
-rw-r--r--intern/cycles/kernel/split/kernel_shadow_blocked_ao.h42
-rw-r--r--intern/cycles/kernel/split/kernel_shadow_blocked_dl.h87
-rw-r--r--intern/cycles/kernel/split/kernel_split_common.h74
-rw-r--r--intern/cycles/kernel/split/kernel_split_data.h18
-rw-r--r--intern/cycles/kernel/split/kernel_split_data_types.h100
-rw-r--r--intern/cycles/kernel/split/kernel_subsurface_scatter.h334
-rw-r--r--intern/cycles/kernel/svm/svm_closure.h343
-rw-r--r--intern/cycles/kernel/svm/svm_displace.h9
-rw-r--r--intern/cycles/kernel/svm/svm_geometry.h1
-rw-r--r--intern/cycles/kernel/svm/svm_image.h222
-rw-r--r--intern/cycles/kernel/svm/svm_types.h30
-rw-r--r--intern/cycles/kernel/svm/svm_voxel.h7
132 files changed, 7386 insertions, 1576 deletions
diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt
index dbc2ba2503a..23e9bd311c4 100644
--- a/intern/cycles/kernel/CMakeLists.txt
+++ b/intern/cycles/kernel/CMakeLists.txt
@@ -10,7 +10,23 @@ set(INC_SYS
set(SRC
kernels/cpu/kernel.cpp
+ kernels/cpu/kernel_sse2.cpp
+ kernels/cpu/kernel_sse3.cpp
+ kernels/cpu/kernel_sse41.cpp
+ kernels/cpu/kernel_avx.cpp
+ kernels/cpu/kernel_avx2.cpp
kernels/cpu/kernel_split.cpp
+ kernels/cpu/kernel_split_sse2.cpp
+ kernels/cpu/kernel_split_sse3.cpp
+ kernels/cpu/kernel_split_sse41.cpp
+ kernels/cpu/kernel_split_avx.cpp
+ kernels/cpu/kernel_split_avx2.cpp
+ kernels/cpu/filter.cpp
+ kernels/cpu/filter_sse2.cpp
+ kernels/cpu/filter_sse3.cpp
+ kernels/cpu/filter_sse41.cpp
+ kernels/cpu/filter_avx.cpp
+ kernels/cpu/filter_avx2.cpp
kernels/opencl/kernel.cl
kernels/opencl/kernel_state_buffer_size.cl
kernels/opencl/kernel_split.cl
@@ -21,17 +37,22 @@ set(SRC
kernels/opencl/kernel_lamp_emission.cl
kernels/opencl/kernel_do_volume.cl
kernels/opencl/kernel_indirect_background.cl
+ kernels/opencl/kernel_shader_setup.cl
+ kernels/opencl/kernel_shader_sort.cl
kernels/opencl/kernel_shader_eval.cl
kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl
kernels/opencl/kernel_subsurface_scatter.cl
kernels/opencl/kernel_direct_lighting.cl
kernels/opencl/kernel_shadow_blocked_ao.cl
kernels/opencl/kernel_shadow_blocked_dl.cl
+ kernels/opencl/kernel_enqueue_inactive.cl
kernels/opencl/kernel_next_iteration_setup.cl
kernels/opencl/kernel_indirect_subsurface.cl
kernels/opencl/kernel_buffer_update.cl
+ kernels/opencl/filter.cl
kernels/cuda/kernel.cu
kernels/cuda/kernel_split.cu
+ kernels/cuda/filter.cu
)
set(SRC_BVH_HEADERS
@@ -93,12 +114,18 @@ set(SRC_KERNELS_CPU_HEADERS
kernels/cpu/kernel_cpu.h
kernels/cpu/kernel_cpu_impl.h
kernels/cpu/kernel_cpu_image.h
+ kernels/cpu/filter_cpu.h
+ kernels/cpu/filter_cpu_impl.h
)
set(SRC_KERNELS_CUDA_HEADERS
kernels/cuda/kernel_config.h
)
+set(SRC_KERNELS_OPENCL_HEADERS
+ kernels/opencl/kernel_split_function.h
+)
+
set(SRC_CLOSURE_HEADERS
closure/alloc.h
closure/bsdf.h
@@ -120,6 +147,8 @@ set(SRC_CLOSURE_HEADERS
closure/bssrdf.h
closure/emissive.h
closure/volume.h
+ closure/bsdf_principled_diffuse.h
+ closure/bsdf_principled_sheen.h
)
set(SRC_SVM_HEADERS
@@ -186,6 +215,21 @@ set(SRC_GEOM_HEADERS
geom/geom_volume.h
)
+set(SRC_FILTER_HEADERS
+ filter/filter.h
+ filter/filter_defines.h
+ filter/filter_features.h
+ filter/filter_features_sse.h
+ filter/filter_kernel.h
+ filter/filter_nlm_cpu.h
+ filter/filter_nlm_gpu.h
+ filter/filter_prefilter.h
+ filter/filter_reconstruction.h
+ filter/filter_transform.h
+ filter/filter_transform_gpu.h
+ filter/filter_transform_sse.h
+)
+
set(SRC_UTIL_HEADERS
../util/util_atomic.h
../util/util_color.h
@@ -194,17 +238,52 @@ set(SRC_UTIL_HEADERS
../util/util_math.h
../util/util_math_fast.h
../util/util_math_intersect.h
+ ../util/util_math_float2.h
+ ../util/util_math_float3.h
+ ../util/util_math_float4.h
+ ../util/util_math_int2.h
+ ../util/util_math_int3.h
+ ../util/util_math_int4.h
+ ../util/util_math_matrix.h
../util/util_static_assert.h
../util/util_transform.h
../util/util_texture.h
../util/util_types.h
+ ../util/util_types_float2.h
+ ../util/util_types_float2_impl.h
+ ../util/util_types_float3.h
+ ../util/util_types_float3_impl.h
+ ../util/util_types_float4.h
+ ../util/util_types_float4_impl.h
+ ../util/util_types_int2.h
+ ../util/util_types_int2_impl.h
+ ../util/util_types_int3.h
+ ../util/util_types_int3_impl.h
+ ../util/util_types_int4.h
+ ../util/util_types_int4_impl.h
+ ../util/util_types_uchar2.h
+ ../util/util_types_uchar2_impl.h
+ ../util/util_types_uchar3.h
+ ../util/util_types_uchar3_impl.h
+ ../util/util_types_uchar4.h
+ ../util/util_types_uchar4_impl.h
+ ../util/util_types_uint2.h
+ ../util/util_types_uint2_impl.h
+ ../util/util_types_uint3.h
+ ../util/util_types_uint3_impl.h
+ ../util/util_types_uint4.h
+ ../util/util_types_uint4_impl.h
+ ../util/util_types_vector3.h
+ ../util/util_types_vector3_impl.h
)
set(SRC_SPLIT_HEADERS
+ split/kernel_branched.h
split/kernel_buffer_update.h
split/kernel_data_init.h
split/kernel_direct_lighting.h
split/kernel_do_volume.h
+ split/kernel_enqueue_inactive.h
split/kernel_holdout_emission_blurring_pathtermination_ao.h
split/kernel_indirect_background.h
split/kernel_indirect_subsurface.h
@@ -213,6 +292,8 @@ set(SRC_SPLIT_HEADERS
split/kernel_path_init.h
split/kernel_queue_enqueue.h
split/kernel_scene_intersect.h
+ split/kernel_shader_setup.h
+ split/kernel_shader_sort.h
split/kernel_shader_eval.h
split/kernel_shadow_blocked_ao.h
split/kernel_shadow_blocked_dl.h
@@ -256,23 +337,21 @@ if(WITH_CYCLES_CUDA_BINARIES)
${SRC_CLOSURE_HEADERS}
${SRC_UTIL_HEADERS}
)
+ set(cuda_filter_sources kernels/cuda/filter.cu
+ ${SRC_HEADERS}
+ ${SRC_KERNELS_CUDA_HEADERS}
+ ${SRC_FILTER_HEADERS}
+ ${SRC_UTIL_HEADERS}
+ )
set(cuda_cubins)
- macro(CYCLES_CUDA_KERNEL_ADD arch split experimental)
- if(${split})
- set(cuda_extra_flags "-D__SPLIT__")
- set(cuda_cubin kernel_split)
- else()
- set(cuda_extra_flags "")
- set(cuda_cubin kernel)
- endif()
-
+ macro(CYCLES_CUDA_KERNEL_ADD arch name flags sources experimental)
if(${experimental})
- set(cuda_extra_flags ${cuda_extra_flags} -D__KERNEL_EXPERIMENTAL__)
- set(cuda_cubin ${cuda_cubin}_experimental)
+ set(flags ${flags} -D__KERNEL_EXPERIMENTAL__)
+ set(name ${name}_experimental)
endif()
- set(cuda_cubin ${cuda_cubin}_${arch}.cubin)
+ set(cuda_cubin ${name}_${arch}.cubin)
if(WITH_CYCLES_DEBUG)
set(cuda_debug_flags "-D__KERNEL_DEBUG__")
@@ -286,11 +365,7 @@ if(WITH_CYCLES_CUDA_BINARIES)
set(cuda_version_flags "-D__KERNEL_CUDA_VERSION__=${cuda_nvcc_version}")
set(cuda_math_flags "--use_fast_math")
- if(split)
- set(cuda_kernel_src "/kernels/cuda/kernel_split.cu")
- else()
- set(cuda_kernel_src "/kernels/cuda/kernel.cu")
- endif()
+ set(cuda_kernel_src "/kernels/cuda/${name}.cu")
add_custom_command(
OUTPUT ${cuda_cubin}
@@ -304,13 +379,13 @@ if(WITH_CYCLES_CUDA_BINARIES)
${cuda_arch_flags}
${cuda_version_flags}
${cuda_math_flags}
- ${cuda_extra_flags}
+ ${flags}
${cuda_debug_flags}
-I${CMAKE_CURRENT_SOURCE_DIR}/..
-DCCL_NAMESPACE_BEGIN=
-DCCL_NAMESPACE_END=
-DNVCC
- DEPENDS ${cuda_sources})
+ DEPENDS ${sources})
delayed_install("${CMAKE_CURRENT_BINARY_DIR}" "${cuda_cubin}" ${CYCLES_INSTALL_PATH}/lib)
list(APPEND cuda_cubins ${cuda_cubin})
@@ -324,11 +399,12 @@ if(WITH_CYCLES_CUDA_BINARIES)
foreach(arch ${CYCLES_CUDA_BINARIES_ARCH})
# Compile regular kernel
- CYCLES_CUDA_KERNEL_ADD(${arch} FALSE FALSE)
+ CYCLES_CUDA_KERNEL_ADD(${arch} kernel "" "${cuda_sources}" FALSE)
+ CYCLES_CUDA_KERNEL_ADD(${arch} filter "" "${cuda_filter_sources}" FALSE)
if(WITH_CYCLES_CUDA_SPLIT_KERNEL_BINARIES)
# Compile split kernel
- CYCLES_CUDA_KERNEL_ADD(${arch} TRUE FALSE)
+ CYCLES_CUDA_KERNEL_ADD(${arch} kernel_split "-D__SPLIT__" ${cuda_sources} FALSE)
endif()
endforeach()
@@ -349,41 +425,30 @@ include_directories(SYSTEM ${INC_SYS})
set_source_files_properties(kernels/cpu/kernel.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_KERNEL_FLAGS}")
set_source_files_properties(kernels/cpu/kernel_split.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_KERNEL_FLAGS}")
+set_source_files_properties(kernels/cpu/filter.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_KERNEL_FLAGS}")
if(CXX_HAS_SSE)
- list(APPEND SRC
- kernels/cpu/kernel_sse2.cpp
- kernels/cpu/kernel_sse3.cpp
- kernels/cpu/kernel_sse41.cpp
- kernels/cpu/kernel_split_sse2.cpp
- kernels/cpu/kernel_split_sse3.cpp
- kernels/cpu/kernel_split_sse41.cpp
- )
-
set_source_files_properties(kernels/cpu/kernel_sse2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS}")
set_source_files_properties(kernels/cpu/kernel_sse3.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE3_KERNEL_FLAGS}")
set_source_files_properties(kernels/cpu/kernel_sse41.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE41_KERNEL_FLAGS}")
set_source_files_properties(kernels/cpu/kernel_split_sse2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS}")
set_source_files_properties(kernels/cpu/kernel_split_sse3.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE3_KERNEL_FLAGS}")
set_source_files_properties(kernels/cpu/kernel_split_sse41.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE41_KERNEL_FLAGS}")
+ set_source_files_properties(kernels/cpu/filter_sse2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS}")
+ set_source_files_properties(kernels/cpu/filter_sse3.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE3_KERNEL_FLAGS}")
+ set_source_files_properties(kernels/cpu/filter_sse41.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE41_KERNEL_FLAGS}")
endif()
if(CXX_HAS_AVX)
- list(APPEND SRC
- kernels/cpu/kernel_avx.cpp
- kernels/cpu/kernel_split_avx.cpp
- )
set_source_files_properties(kernels/cpu/kernel_avx.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX_KERNEL_FLAGS}")
set_source_files_properties(kernels/cpu/kernel_split_avx.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX_KERNEL_FLAGS}")
+ set_source_files_properties(kernels/cpu/filter_avx.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX_KERNEL_FLAGS}")
endif()
if(CXX_HAS_AVX2)
- list(APPEND SRC
- kernels/cpu/kernel_avx2.cpp
- kernels/cpu/kernel_split_avx2.cpp
- )
set_source_files_properties(kernels/cpu/kernel_avx2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX2_KERNEL_FLAGS}")
set_source_files_properties(kernels/cpu/kernel_split_avx2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX2_KERNEL_FLAGS}")
+ set_source_files_properties(kernels/cpu/filter_avx2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX2_KERNEL_FLAGS}")
endif()
add_library(cycles_kernel
@@ -391,8 +456,10 @@ add_library(cycles_kernel
${SRC_HEADERS}
${SRC_KERNELS_CPU_HEADERS}
${SRC_KERNELS_CUDA_HEADERS}
+ ${SRC_KERNELS_OPENCL_HEADERS}
${SRC_BVH_HEADERS}
${SRC_CLOSURE_HEADERS}
+ ${SRC_FILTER_HEADERS}
${SRC_SVM_HEADERS}
${SRC_GEOM_HEADERS}
${SRC_SPLIT_HEADERS}
@@ -422,21 +489,28 @@ delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_scene_interse
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_lamp_emission.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_do_volume.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_indirect_background.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_shader_setup.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_shader_sort.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_shader_eval.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_subsurface_scatter.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_direct_lighting.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_shadow_blocked_ao.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_shadow_blocked_dl.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_enqueue_inactive.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_next_iteration_setup.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_indirect_subsurface.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_buffer_update.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_split_function.h" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/filter.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/cuda/kernel.cu" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/cuda)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/cuda/kernel_split.cu" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/cuda)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/cuda/filter.cu" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/cuda)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNELS_CUDA_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/cuda)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_BVH_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/bvh)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_CLOSURE_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/closure)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_FILTER_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/filter)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_SVM_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/svm)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_GEOM_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/geom)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_UTIL_HEADERS}" ${CYCLES_INSTALL_PATH}/source/util)
diff --git a/intern/cycles/kernel/closure/bsdf.h b/intern/cycles/kernel/closure/bsdf.h
index 9139b99353a..86a00d2124d 100644
--- a/intern/cycles/kernel/closure/bsdf.h
+++ b/intern/cycles/kernel/closure/bsdf.h
@@ -27,6 +27,8 @@
#include "kernel/closure/bsdf_ashikhmin_shirley.h"
#include "kernel/closure/bsdf_toon.h"
#include "kernel/closure/bsdf_hair.h"
+#include "kernel/closure/bsdf_principled_diffuse.h"
+#include "kernel/closure/bsdf_principled_sheen.h"
#ifdef __SUBSURFACE__
# include "kernel/closure/bssrdf.h"
#endif
@@ -86,16 +88,21 @@ ccl_device_forceinline int bsdf_sample(KernelGlobals *kg,
eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
break;
case CLOSURE_BSDF_MICROFACET_GGX_ID:
+ case CLOSURE_BSDF_MICROFACET_GGX_FRESNEL_ID:
+ case CLOSURE_BSDF_MICROFACET_GGX_CLEARCOAT_ID:
case CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID:
+ case CLOSURE_BSDF_MICROFACET_GGX_ANISO_FRESNEL_ID:
case CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID:
label = bsdf_microfacet_ggx_sample(kg, sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
break;
case CLOSURE_BSDF_MICROFACET_MULTI_GGX_ID:
+ case CLOSURE_BSDF_MICROFACET_MULTI_GGX_FRESNEL_ID:
label = bsdf_microfacet_multi_ggx_sample(kg, sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
eval, omega_in, &domega_in->dx, &domega_in->dy, pdf, &sd->lcg_state);
break;
case CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID:
+ case CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_FRESNEL_ID:
label = bsdf_microfacet_multi_ggx_glass_sample(kg, sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
eval, omega_in, &domega_in->dx, &domega_in->dy, pdf, &sd->lcg_state);
break;
@@ -130,6 +137,17 @@ ccl_device_forceinline int bsdf_sample(KernelGlobals *kg,
label = bsdf_hair_transmission_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
break;
+#ifdef __PRINCIPLED__
+ case CLOSURE_BSDF_PRINCIPLED_DIFFUSE_ID:
+ case CLOSURE_BSDF_BSSRDF_PRINCIPLED_ID:
+ label = bsdf_principled_diffuse_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
+ eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
+ break;
+ case CLOSURE_BSDF_PRINCIPLED_SHEEN_ID:
+ label = bsdf_principled_sheen_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
+ eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
+ break;
+#endif /* __PRINCIPLED__ */
#endif
#ifdef __VOLUME__
case CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID:
@@ -188,14 +206,19 @@ float3 bsdf_eval(KernelGlobals *kg,
eval = bsdf_transparent_eval_reflect(sc, sd->I, omega_in, pdf);
break;
case CLOSURE_BSDF_MICROFACET_GGX_ID:
+ case CLOSURE_BSDF_MICROFACET_GGX_FRESNEL_ID:
+ case CLOSURE_BSDF_MICROFACET_GGX_CLEARCOAT_ID:
case CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID:
+ case CLOSURE_BSDF_MICROFACET_GGX_ANISO_FRESNEL_ID:
case CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID:
eval = bsdf_microfacet_ggx_eval_reflect(sc, sd->I, omega_in, pdf);
break;
case CLOSURE_BSDF_MICROFACET_MULTI_GGX_ID:
+ case CLOSURE_BSDF_MICROFACET_MULTI_GGX_FRESNEL_ID:
eval = bsdf_microfacet_multi_ggx_eval_reflect(sc, sd->I, omega_in, pdf, &sd->lcg_state);
break;
case CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID:
+ case CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_FRESNEL_ID:
eval = bsdf_microfacet_multi_ggx_glass_eval_reflect(sc, sd->I, omega_in, pdf, &sd->lcg_state);
break;
case CLOSURE_BSDF_MICROFACET_BECKMANN_ID:
@@ -222,6 +245,15 @@ float3 bsdf_eval(KernelGlobals *kg,
case CLOSURE_BSDF_HAIR_TRANSMISSION_ID:
eval = bsdf_hair_transmission_eval_reflect(sc, sd->I, omega_in, pdf);
break;
+#ifdef __PRINCIPLED__
+ case CLOSURE_BSDF_PRINCIPLED_DIFFUSE_ID:
+ case CLOSURE_BSDF_BSSRDF_PRINCIPLED_ID:
+ eval = bsdf_principled_diffuse_eval_reflect(sc, sd->I, omega_in, pdf);
+ break;
+ case CLOSURE_BSDF_PRINCIPLED_SHEEN_ID:
+ eval = bsdf_principled_sheen_eval_reflect(sc, sd->I, omega_in, pdf);
+ break;
+#endif /* __PRINCIPLED__ */
#endif
#ifdef __VOLUME__
case CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID:
@@ -256,14 +288,19 @@ float3 bsdf_eval(KernelGlobals *kg,
eval = bsdf_transparent_eval_transmit(sc, sd->I, omega_in, pdf);
break;
case CLOSURE_BSDF_MICROFACET_GGX_ID:
+ case CLOSURE_BSDF_MICROFACET_GGX_FRESNEL_ID:
+ case CLOSURE_BSDF_MICROFACET_GGX_CLEARCOAT_ID:
case CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID:
+ case CLOSURE_BSDF_MICROFACET_GGX_ANISO_FRESNEL_ID:
case CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID:
eval = bsdf_microfacet_ggx_eval_transmit(sc, sd->I, omega_in, pdf);
break;
case CLOSURE_BSDF_MICROFACET_MULTI_GGX_ID:
+ case CLOSURE_BSDF_MICROFACET_MULTI_GGX_FRESNEL_ID:
eval = bsdf_microfacet_multi_ggx_eval_transmit(sc, sd->I, omega_in, pdf, &sd->lcg_state);
break;
case CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID:
+ case CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_FRESNEL_ID:
eval = bsdf_microfacet_multi_ggx_glass_eval_transmit(sc, sd->I, omega_in, pdf, &sd->lcg_state);
break;
case CLOSURE_BSDF_MICROFACET_BECKMANN_ID:
@@ -290,6 +327,15 @@ float3 bsdf_eval(KernelGlobals *kg,
case CLOSURE_BSDF_HAIR_TRANSMISSION_ID:
eval = bsdf_hair_transmission_eval_transmit(sc, sd->I, omega_in, pdf);
break;
+#ifdef __PRINCIPLED__
+ case CLOSURE_BSDF_PRINCIPLED_DIFFUSE_ID:
+ case CLOSURE_BSDF_BSSRDF_PRINCIPLED_ID:
+ eval = bsdf_principled_diffuse_eval_transmit(sc, sd->I, omega_in, pdf);
+ break;
+ case CLOSURE_BSDF_PRINCIPLED_SHEEN_ID:
+ eval = bsdf_principled_sheen_eval_transmit(sc, sd->I, omega_in, pdf);
+ break;
+#endif /* __PRINCIPLED__ */
#endif
#ifdef __VOLUME__
case CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID:
@@ -311,11 +357,16 @@ ccl_device void bsdf_blur(KernelGlobals *kg, ShaderClosure *sc, float roughness)
#ifdef __SVM__
switch(sc->type) {
case CLOSURE_BSDF_MICROFACET_MULTI_GGX_ID:
+ case CLOSURE_BSDF_MICROFACET_MULTI_GGX_FRESNEL_ID:
case CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID:
+ case CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_FRESNEL_ID:
bsdf_microfacet_multi_ggx_blur(sc, roughness);
break;
case CLOSURE_BSDF_MICROFACET_GGX_ID:
+ case CLOSURE_BSDF_MICROFACET_GGX_FRESNEL_ID:
+ case CLOSURE_BSDF_MICROFACET_GGX_CLEARCOAT_ID:
case CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID:
+ case CLOSURE_BSDF_MICROFACET_GGX_ANISO_FRESNEL_ID:
case CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID:
bsdf_microfacet_ggx_blur(sc, roughness);
break;
@@ -349,10 +400,15 @@ ccl_device bool bsdf_merge(ShaderClosure *a, ShaderClosure *b)
case CLOSURE_BSDF_REFLECTION_ID:
case CLOSURE_BSDF_REFRACTION_ID:
case CLOSURE_BSDF_MICROFACET_GGX_ID:
+ case CLOSURE_BSDF_MICROFACET_GGX_FRESNEL_ID:
+ case CLOSURE_BSDF_MICROFACET_GGX_CLEARCOAT_ID:
case CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID:
+ case CLOSURE_BSDF_MICROFACET_GGX_ANISO_FRESNEL_ID:
case CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID:
case CLOSURE_BSDF_MICROFACET_MULTI_GGX_ID:
+ case CLOSURE_BSDF_MICROFACET_MULTI_GGX_FRESNEL_ID:
case CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID:
+ case CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_FRESNEL_ID:
case CLOSURE_BSDF_MICROFACET_BECKMANN_ID:
case CLOSURE_BSDF_MICROFACET_BECKMANN_ANISO_ID:
case CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID:
@@ -367,6 +423,11 @@ ccl_device bool bsdf_merge(ShaderClosure *a, ShaderClosure *b)
case CLOSURE_BSDF_HAIR_REFLECTION_ID:
case CLOSURE_BSDF_HAIR_TRANSMISSION_ID:
return bsdf_hair_merge(a, b);
+#ifdef __PRINCIPLED__
+ case CLOSURE_BSDF_PRINCIPLED_DIFFUSE_ID:
+ case CLOSURE_BSDF_BSSRDF_PRINCIPLED_ID:
+ return bsdf_principled_diffuse_merge(a, b);
+#endif
#ifdef __VOLUME__
case CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID:
return volume_henyey_greenstein_merge(a, b);
@@ -379,5 +440,23 @@ ccl_device bool bsdf_merge(ShaderClosure *a, ShaderClosure *b)
#endif
}
+/* Classifies a closure as diffuse-like or specular-like.
+ * This is needed for the denoising feature pass generation,
+ * which are written on the first bounce where more than 25%
+ * of the sampling weight belongs to diffuse-line closures. */
+ccl_device_inline bool bsdf_is_specular_like(ShaderClosure *sc)
+{
+ if(CLOSURE_IS_BSDF_TRANSPARENT(sc->type)) {
+ return true;
+ }
+
+ if(CLOSURE_IS_BSDF_MICROFACET(sc->type)) {
+ MicrofacetBsdf *bsdf = (MicrofacetBsdf*) sc;
+ return (bsdf->alpha_x*bsdf->alpha_y <= 0.075f*0.075f);
+ }
+
+ return false;
+}
+
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/closure/bsdf_ashikhmin_velvet.h b/intern/cycles/kernel/closure/bsdf_ashikhmin_velvet.h
index 7e0f5a7ec75..a5ba2cb2972 100644
--- a/intern/cycles/kernel/closure/bsdf_ashikhmin_velvet.h
+++ b/intern/cycles/kernel/closure/bsdf_ashikhmin_velvet.h
@@ -40,7 +40,6 @@ typedef ccl_addr_space struct VelvetBsdf {
float sigma;
float invsigma2;
- float3 N;
} VelvetBsdf;
ccl_device int bsdf_ashikhmin_velvet_setup(VelvetBsdf *bsdf)
diff --git a/intern/cycles/kernel/closure/bsdf_diffuse.h b/intern/cycles/kernel/closure/bsdf_diffuse.h
index dcd187f9305..ec6f1f20996 100644
--- a/intern/cycles/kernel/closure/bsdf_diffuse.h
+++ b/intern/cycles/kernel/closure/bsdf_diffuse.h
@@ -37,7 +37,6 @@ CCL_NAMESPACE_BEGIN
typedef ccl_addr_space struct DiffuseBsdf {
SHADER_CLOSURE_BASE;
- float3 N;
} DiffuseBsdf;
/* DIFFUSE */
diff --git a/intern/cycles/kernel/closure/bsdf_diffuse_ramp.h b/intern/cycles/kernel/closure/bsdf_diffuse_ramp.h
index 2d982a95fe4..24f40af46a3 100644
--- a/intern/cycles/kernel/closure/bsdf_diffuse_ramp.h
+++ b/intern/cycles/kernel/closure/bsdf_diffuse_ramp.h
@@ -40,7 +40,6 @@ CCL_NAMESPACE_BEGIN
typedef ccl_addr_space struct DiffuseRampBsdf {
SHADER_CLOSURE_BASE;
- float3 N;
float3 *colors;
} DiffuseRampBsdf;
diff --git a/intern/cycles/kernel/closure/bsdf_microfacet.h b/intern/cycles/kernel/closure/bsdf_microfacet.h
index 1c7b3eb9ddd..b12e248f0a3 100644
--- a/intern/cycles/kernel/closure/bsdf_microfacet.h
+++ b/intern/cycles/kernel/closure/bsdf_microfacet.h
@@ -36,7 +36,8 @@
CCL_NAMESPACE_BEGIN
typedef ccl_addr_space struct MicrofacetExtra {
- float3 color;
+ float3 color, cspec0;
+ float clearcoat;
} MicrofacetExtra;
typedef ccl_addr_space struct MicrofacetBsdf {
@@ -45,7 +46,6 @@ typedef ccl_addr_space struct MicrofacetBsdf {
float alpha_x, alpha_y, ior;
MicrofacetExtra *extra;
float3 T;
- float3 N;
} MicrofacetBsdf;
/* Beckmann and GGX microfacet importance sampling. */
@@ -233,6 +233,36 @@ ccl_device_forceinline float3 microfacet_sample_stretched(
return normalize(make_float3(-slope_x, -slope_y, 1.0f));
}
+/* Calculate the reflection color
+ *
+ * If fresnel is used, the color is an interpolation of the F0 color and white
+ * with respect to the fresnel
+ *
+ * Else it is simply white
+ */
+ccl_device_forceinline float3 reflection_color(const MicrofacetBsdf *bsdf, float3 L, float3 H) {
+ float3 F = make_float3(1.0f, 1.0f, 1.0f);
+ bool use_fresnel = (bsdf->type == CLOSURE_BSDF_MICROFACET_GGX_FRESNEL_ID
+ || bsdf->type == CLOSURE_BSDF_MICROFACET_GGX_CLEARCOAT_ID
+ || bsdf->type == CLOSURE_BSDF_MICROFACET_GGX_ANISO_FRESNEL_ID);
+
+ if(use_fresnel) {
+ float F0 = fresnel_dielectric_cos(1.0f, bsdf->ior);
+
+ F = interpolate_fresnel_color(L, H, bsdf->ior, F0, bsdf->extra->cspec0);
+ }
+
+ return F;
+}
+
+ccl_device_forceinline float D_GTR1(float NdotH, float alpha)
+{
+ if(alpha >= 1.0f) return M_1_PI_F;
+ float alpha2 = alpha*alpha;
+ float t = 1.0f + (alpha2 - 1.0f) * NdotH*NdotH;
+ return (alpha2 - 1.0f) / (M_PI_F * logf(alpha2) * t);
+}
+
/* GGX microfacet with Smith shadow-masking from:
*
* Microfacet Models for Refraction through Rough Surfaces
@@ -248,14 +278,52 @@ ccl_device_forceinline float3 microfacet_sample_stretched(
ccl_device int bsdf_microfacet_ggx_setup(MicrofacetBsdf *bsdf)
{
+ bsdf->extra = NULL;
+
bsdf->alpha_x = saturate(bsdf->alpha_x);
bsdf->alpha_y = bsdf->alpha_x;
-
+
bsdf->type = CLOSURE_BSDF_MICROFACET_GGX_ID;
return SD_BSDF|SD_BSDF_HAS_EVAL;
}
+ccl_device int bsdf_microfacet_ggx_fresnel_setup(MicrofacetBsdf *bsdf, const ShaderData *sd)
+{
+ bsdf->extra->cspec0.x = saturate(bsdf->extra->cspec0.x);
+ bsdf->extra->cspec0.y = saturate(bsdf->extra->cspec0.y);
+ bsdf->extra->cspec0.z = saturate(bsdf->extra->cspec0.z);
+
+ float F0 = fresnel_dielectric_cos(1.0f, bsdf->ior);
+ float F = average(interpolate_fresnel_color(sd->I, bsdf->N, bsdf->ior, F0, bsdf->extra->cspec0));
+ bsdf->sample_weight *= F;
+
+ bsdf->alpha_x = saturate(bsdf->alpha_x);
+ bsdf->alpha_y = bsdf->alpha_x;
+
+ bsdf->type = CLOSURE_BSDF_MICROFACET_GGX_FRESNEL_ID;
+
+ return SD_BSDF|SD_BSDF_HAS_EVAL;
+}
+
+ccl_device int bsdf_microfacet_ggx_clearcoat_setup(MicrofacetBsdf *bsdf, const ShaderData *sd)
+{
+ bsdf->extra->cspec0.x = saturate(bsdf->extra->cspec0.x);
+ bsdf->extra->cspec0.y = saturate(bsdf->extra->cspec0.y);
+ bsdf->extra->cspec0.z = saturate(bsdf->extra->cspec0.z);
+
+ float F0 = fresnel_dielectric_cos(1.0f, bsdf->ior);
+ float F = average(interpolate_fresnel_color(sd->I, bsdf->N, bsdf->ior, F0, bsdf->extra->cspec0));
+ bsdf->sample_weight *= 0.25f * bsdf->extra->clearcoat * F;
+
+ bsdf->alpha_x = saturate(bsdf->alpha_x);
+ bsdf->alpha_y = bsdf->alpha_x;
+
+ bsdf->type = CLOSURE_BSDF_MICROFACET_GGX_CLEARCOAT_ID;
+
+ return SD_BSDF|SD_BSDF_HAS_EVAL;
+}
+
ccl_device bool bsdf_microfacet_merge(const ShaderClosure *a, const ShaderClosure *b)
{
const MicrofacetBsdf *bsdf_a = (const MicrofacetBsdf*)a;
@@ -273,16 +341,38 @@ ccl_device bool bsdf_microfacet_merge(const ShaderClosure *a, const ShaderClosur
ccl_device int bsdf_microfacet_ggx_aniso_setup(MicrofacetBsdf *bsdf)
{
+ bsdf->extra = NULL;
+
bsdf->alpha_x = saturate(bsdf->alpha_x);
bsdf->alpha_y = saturate(bsdf->alpha_y);
-
+
bsdf->type = CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID;
return SD_BSDF|SD_BSDF_HAS_EVAL;
}
+ccl_device int bsdf_microfacet_ggx_aniso_fresnel_setup(MicrofacetBsdf *bsdf, const ShaderData *sd)
+{
+ bsdf->extra->cspec0.x = saturate(bsdf->extra->cspec0.x);
+ bsdf->extra->cspec0.y = saturate(bsdf->extra->cspec0.y);
+ bsdf->extra->cspec0.z = saturate(bsdf->extra->cspec0.z);
+
+ float F0 = fresnel_dielectric_cos(1.0f, bsdf->ior);
+ float F = average(interpolate_fresnel_color(sd->I, bsdf->N, bsdf->ior, F0, bsdf->extra->cspec0));
+ bsdf->sample_weight *= F;
+
+ bsdf->alpha_x = saturate(bsdf->alpha_x);
+ bsdf->alpha_y = saturate(bsdf->alpha_y);
+
+ bsdf->type = CLOSURE_BSDF_MICROFACET_GGX_ANISO_FRESNEL_ID;
+
+ return SD_BSDF|SD_BSDF_HAS_EVAL;
+}
+
ccl_device int bsdf_microfacet_ggx_refraction_setup(MicrofacetBsdf *bsdf)
{
+ bsdf->extra = NULL;
+
bsdf->alpha_x = saturate(bsdf->alpha_x);
bsdf->alpha_y = bsdf->alpha_x;
@@ -319,6 +409,8 @@ ccl_device float3 bsdf_microfacet_ggx_eval_reflect(const ShaderClosure *sc, cons
float alpha2 = alpha_x * alpha_y;
float D, G1o, G1i;
+ bool is_principled_clearcoat = (bsdf->type == CLOSURE_BSDF_MICROFACET_GGX_CLEARCOAT_ID);
+
if(alpha_x == alpha_y) {
/* isotropic
* eq. 20: (F*G*D)/(4*in*on)
@@ -327,7 +419,18 @@ ccl_device float3 bsdf_microfacet_ggx_eval_reflect(const ShaderClosure *sc, cons
float cosThetaM2 = cosThetaM * cosThetaM;
float cosThetaM4 = cosThetaM2 * cosThetaM2;
float tanThetaM2 = (1 - cosThetaM2) / cosThetaM2;
- D = alpha2 / (M_PI_F * cosThetaM4 * (alpha2 + tanThetaM2) * (alpha2 + tanThetaM2));
+
+ if(is_principled_clearcoat) {
+ /* use GTR1 for clearcoat */
+ D = D_GTR1(cosThetaM, bsdf->alpha_x);
+
+ /* the alpha value for clearcoat is a fixed 0.25 => alpha2 = 0.25 * 0.25 */
+ alpha2 = 0.0625f;
+ }
+ else {
+ /* use GTR2 otherwise */
+ D = alpha2 / (M_PI_F * cosThetaM4 * (alpha2 + tanThetaM2) * (alpha2 + tanThetaM2));
+ }
/* eq. 34: now calculate G1(i,m) and G1(o,m) */
G1o = 2 / (1 + safe_sqrtf(1 + alpha2 * (1 - cosNO * cosNO) / (cosNO * cosNO)));
@@ -374,7 +477,13 @@ ccl_device float3 bsdf_microfacet_ggx_eval_reflect(const ShaderClosure *sc, cons
/* eq. 20 */
float common = D * 0.25f / cosNO;
- float out = G * common;
+
+ float3 F = reflection_color(bsdf, omega_in, m);
+ if(is_principled_clearcoat) {
+ F *= 0.25f * bsdf->extra->clearcoat;
+ }
+
+ float3 out = F * G * common;
/* eq. 2 in distribution of visible normals sampling
* pm = Dw = G1o * dot(m, I) * D / dot(N, I); */
@@ -384,7 +493,7 @@ ccl_device float3 bsdf_microfacet_ggx_eval_reflect(const ShaderClosure *sc, cons
* pdf = pm * 0.25 / dot(m, I); */
*pdf = G1o * common;
- return make_float3(out, out, out);
+ return out;
}
return make_float3(0.0f, 0.0f, 0.0f);
@@ -489,6 +598,17 @@ ccl_device int bsdf_microfacet_ggx_sample(KernelGlobals *kg, const ShaderClosure
/* some high number for MIS */
*pdf = 1e6f;
*eval = make_float3(1e6f, 1e6f, 1e6f);
+
+ bool use_fresnel = (bsdf->type == CLOSURE_BSDF_MICROFACET_GGX_FRESNEL_ID
+ || bsdf->type == CLOSURE_BSDF_MICROFACET_GGX_CLEARCOAT_ID
+ || bsdf->type == CLOSURE_BSDF_MICROFACET_GGX_ANISO_FRESNEL_ID);
+
+ /* if fresnel is used, calculate the color with reflection_color(...) */
+ if(use_fresnel) {
+ *pdf = 1.0f;
+ *eval = reflection_color(bsdf, *omega_in, m);
+ }
+
label = LABEL_REFLECT | LABEL_SINGULAR;
}
else {
@@ -497,16 +617,32 @@ ccl_device int bsdf_microfacet_ggx_sample(KernelGlobals *kg, const ShaderClosure
float alpha2 = alpha_x * alpha_y;
float D, G1i;
+ bool is_principled_clearcoat = (bsdf->type == CLOSURE_BSDF_MICROFACET_GGX_CLEARCOAT_ID);
+
if(alpha_x == alpha_y) {
/* isotropic */
float cosThetaM2 = cosThetaM * cosThetaM;
float cosThetaM4 = cosThetaM2 * cosThetaM2;
float tanThetaM2 = 1/(cosThetaM2) - 1;
- D = alpha2 / (M_PI_F * cosThetaM4 * (alpha2 + tanThetaM2) * (alpha2 + tanThetaM2));
/* eval BRDF*cosNI */
float cosNI = dot(N, *omega_in);
+ if(is_principled_clearcoat) {
+ /* use GTR1 for clearcoat */
+ D = D_GTR1(cosThetaM, bsdf->alpha_x);
+
+ /* the alpha value for clearcoat is a fixed 0.25 => alpha2 = 0.25 * 0.25 */
+ alpha2 = 0.0625f;
+
+ /* recalculate G1o */
+ G1o = 2 / (1 + safe_sqrtf(1 + alpha2 * (1 - cosNO * cosNO) / (cosNO * cosNO)));
+ }
+ else {
+ /* use GTR2 otherwise */
+ D = alpha2 / (M_PI_F * cosThetaM4 * (alpha2 + tanThetaM2) * (alpha2 + tanThetaM2));
+ }
+
/* eq. 34: now calculate G1(i,m) */
G1i = 2 / (1 + safe_sqrtf(1 + alpha2 * (1 - cosNI * cosNI) / (cosNI * cosNI)));
}
@@ -538,10 +674,14 @@ ccl_device int bsdf_microfacet_ggx_sample(KernelGlobals *kg, const ShaderClosure
/* see eval function for derivation */
float common = (G1o * D) * 0.25f / cosNO;
- float out = G1i * common;
*pdf = common;
- *eval = make_float3(out, out, out);
+ float3 F = reflection_color(bsdf, *omega_in, m);
+ if(is_principled_clearcoat) {
+ F *= 0.25f * bsdf->extra->clearcoat;
+ }
+
+ *eval = G1i * common * F;
}
#ifdef __RAY_DIFFERENTIALS__
diff --git a/intern/cycles/kernel/closure/bsdf_microfacet_multi.h b/intern/cycles/kernel/closure/bsdf_microfacet_multi.h
index 7d87727004f..2f2c35d5d1f 100644
--- a/intern/cycles/kernel/closure/bsdf_microfacet_multi.h
+++ b/intern/cycles/kernel/closure/bsdf_microfacet_multi.h
@@ -40,20 +40,20 @@ ccl_device_forceinline float D_ggx_aniso(const float3 wm, const float2 alpha)
}
/* Sample slope distribution (based on page 14 of the supplemental implementation). */
-ccl_device_forceinline float2 mf_sampleP22_11(const float cosI, const float2 randU)
+ccl_device_forceinline float2 mf_sampleP22_11(const float cosI, const float randx, const float randy)
{
- if(cosI > 0.9999f || cosI < 1e-6f) {
- const float r = sqrtf(randU.x / max(1.0f - randU.x, 1e-7f));
- const float phi = M_2PI_F * randU.y;
+ if(cosI > 0.9999f || fabsf(cosI) < 1e-6f) {
+ const float r = sqrtf(randx / max(1.0f - randx, 1e-7f));
+ const float phi = M_2PI_F * randy;
return make_float2(r*cosf(phi), r*sinf(phi));
}
- const float sinI = sqrtf(1.0f - cosI*cosI);
+ const float sinI = safe_sqrtf(1.0f - cosI*cosI);
const float tanI = sinI/cosI;
const float projA = 0.5f * (cosI + 1.0f);
if(projA < 0.0001f)
return make_float2(0.0f, 0.0f);
- const float A = 2.0f*randU.x*projA / cosI - 1.0f;
+ const float A = 2.0f*randx*projA / cosI - 1.0f;
float tmp = A*A-1.0f;
if(fabsf(tmp) < 1e-7f)
return make_float2(0.0f, 0.0f);
@@ -64,24 +64,24 @@ ccl_device_forceinline float2 mf_sampleP22_11(const float cosI, const float2 ran
const float slopeX = (A < 0.0f || slopeX2 > 1.0f/tanI)? (tanI*tmp - D) : slopeX2;
float U2;
- if(randU.y >= 0.5f)
- U2 = 2.0f*(randU.y - 0.5f);
+ if(randy >= 0.5f)
+ U2 = 2.0f*(randy - 0.5f);
else
- U2 = 2.0f*(0.5f - randU.y);
+ U2 = 2.0f*(0.5f - randy);
const float z = (U2*(U2*(U2*0.27385f-0.73369f)+0.46341f)) / (U2*(U2*(U2*0.093073f+0.309420f)-1.0f)+0.597999f);
const float slopeY = z * sqrtf(1.0f + slopeX*slopeX);
- if(randU.y >= 0.5f)
+ if(randy >= 0.5f)
return make_float2(slopeX, slopeY);
else
return make_float2(slopeX, -slopeY);
}
/* Visible normal sampling for the GGX distribution (based on page 7 of the supplemental implementation). */
-ccl_device_forceinline float3 mf_sample_vndf(const float3 wi, const float2 alpha, const float2 randU)
+ccl_device_forceinline float3 mf_sample_vndf(const float3 wi, const float2 alpha, const float randx, const float randy)
{
const float3 wi_11 = normalize(make_float3(alpha.x*wi.x, alpha.y*wi.y, wi.z));
- const float2 slope_11 = mf_sampleP22_11(wi_11.z, randU);
+ const float2 slope_11 = mf_sampleP22_11(wi_11.z, randx, randy);
const float3 cossin_phi = safe_normalize(make_float3(wi_11.x, wi_11.y, 0.0f));
const float slope_x = alpha.x*(cossin_phi.x * slope_11.x - cossin_phi.y * slope_11.y);
@@ -91,18 +91,15 @@ ccl_device_forceinline float3 mf_sample_vndf(const float3 wi, const float2 alpha
return normalize(make_float3(-slope_x, -slope_y, 1.0f));
}
-/* === Phase functions: Glossy, Diffuse and Glass === */
+/* === Phase functions: Glossy and Glass === */
-/* Phase function for reflective materials, either without a fresnel term (for compatibility) or with the conductive fresnel term. */
-ccl_device_forceinline float3 mf_sample_phase_glossy(const float3 wi, float3 *n, float3 *k, float3 *weight, const float3 wm)
+/* Phase function for reflective materials. */
+ccl_device_forceinline float3 mf_sample_phase_glossy(const float3 wi, float3 *weight, const float3 wm)
{
- if(n && k)
- *weight *= fresnel_conductor(dot(wi, wm), *n, *k);
-
return -wi + 2.0f * wm * dot(wi, wm);
}
-ccl_device_forceinline float3 mf_eval_phase_glossy(const float3 w, const float lambda, const float3 wo, const float2 alpha, float3 *n, float3 *k)
+ccl_device_forceinline float3 mf_eval_phase_glossy(const float3 w, const float lambda, const float3 wo, const float2 alpha)
{
if(w.z > 0.9999f)
return make_float3(0.0f, 0.0f, 0.0f);
@@ -123,30 +120,9 @@ ccl_device_forceinline float3 mf_eval_phase_glossy(const float3 w, const float l
else
phase *= D_ggx_aniso(wh, alpha);
- if(n && k) {
- /* Apply conductive fresnel term. */
- return phase * fresnel_conductor(dotW_WH, *n, *k);
- }
-
return make_float3(phase, phase, phase);
}
-/* Phase function for rough lambertian diffuse surfaces. */
-ccl_device_forceinline float3 mf_sample_phase_diffuse(const float3 wm, const float randu, const float randv)
-{
- float3 tm, bm;
- make_orthonormals(wm, &tm, &bm);
-
- float2 disk = concentric_sample_disk(randu, randv);
- return disk.x*tm + disk.y*bm + safe_sqrtf(1.0f - disk.x*disk.x - disk.y*disk.y)*wm;
-}
-
-ccl_device_forceinline float3 mf_eval_phase_diffuse(const float3 w, const float3 wm)
-{
- const float v = max(0.0f, dot(w, wm)) * M_1_PI_F;
- return make_float3(v, v, v);
-}
-
/* Phase function for dielectric transmissive materials, including both reflection and refraction according to the dielectric fresnel term. */
ccl_device_forceinline float3 mf_sample_phase_glass(const float3 wi, const float eta, const float3 wm, const float randV, bool *outside)
{
@@ -269,40 +245,69 @@ ccl_device_forceinline float mf_ggx_albedo(float r)
return saturate(albedo);
}
+ccl_device_inline float mf_ggx_transmission_albedo(float a, float ior)
+{
+ if(ior < 1.0f) {
+ ior = 1.0f/ior;
+ }
+ a = saturate(a);
+ ior = clamp(ior, 1.0f, 3.0f);
+ float I_1 = 0.0476898f*expf(-0.978352f*(ior-0.65657f)*(ior-0.65657f)) - 0.033756f*ior + 0.993261f;
+ float R_1 = (((0.116991f*a - 0.270369f)*a + 0.0501366f)*a - 0.00411511f)*a + 1.00008f;
+ float I_2 = (((-2.08704f*ior + 26.3298f)*ior - 127.906f)*ior + 292.958f)*ior - 287.946f + 199.803f/(ior*ior) - 101.668f/(ior*ior*ior);
+ float R_2 = ((((5.3725f*a -24.9307f)*a + 22.7437f)*a - 3.40751f)*a + 0.0986325f)*a + 0.00493504f;
+
+ return saturate(1.0f + I_2*R_2*0.0019127f - (1.0f - I_1)*(1.0f - R_1)*9.3205f);
+}
+
ccl_device_forceinline float mf_ggx_pdf(const float3 wi, const float3 wo, const float alpha)
{
float D = D_ggx(normalize(wi+wo), alpha);
float lambda = mf_lambda(wi, make_float2(alpha, alpha));
+ float singlescatter = 0.25f * D / max((1.0f + lambda) * wi.z, 1e-7f);
+
+ float multiscatter = wo.z * M_1_PI_F;
+
float albedo = mf_ggx_albedo(alpha);
- return 0.25f * D / max((1.0f + lambda) * wi.z, 1e-7f) + (1.0f - albedo) * wo.z;
+ return albedo*singlescatter + (1.0f - albedo)*multiscatter;
}
ccl_device_forceinline float mf_ggx_aniso_pdf(const float3 wi, const float3 wo, const float2 alpha)
{
- return 0.25f * D_ggx_aniso(normalize(wi+wo), alpha) / ((1.0f + mf_lambda(wi, alpha)) * wi.z) + (1.0f - mf_ggx_albedo(sqrtf(alpha.x*alpha.y))) * wo.z;
-}
+ float D = D_ggx_aniso(normalize(wi+wo), alpha);
+ float lambda = mf_lambda(wi, alpha);
+ float singlescatter = 0.25f * D / max((1.0f + lambda) * wi.z, 1e-7f);
-ccl_device_forceinline float mf_diffuse_pdf(const float3 wo)
-{
- return M_1_PI_F * wo.z;
+ float multiscatter = wo.z * M_1_PI_F;
+
+ float albedo = mf_ggx_albedo(sqrtf(alpha.x*alpha.y));
+ return albedo*singlescatter + (1.0f - albedo)*multiscatter;
}
ccl_device_forceinline float mf_glass_pdf(const float3 wi, const float3 wo, const float alpha, const float eta)
{
- float3 wh;
- float fresnel;
- if(wi.z*wo.z > 0.0f) {
- wh = normalize(wi + wo);
- fresnel = fresnel_dielectric_cos(dot(wi, wh), eta);
- }
- else {
- wh = normalize(wi + wo*eta);
- fresnel = 1.0f - fresnel_dielectric_cos(dot(wi, wh), eta);
- }
+ bool reflective = (wi.z*wo.z > 0.0f);
+
+ float wh_len;
+ float3 wh = normalize_len(wi + (reflective? wo : (wo*eta)), &wh_len);
if(wh.z < 0.0f)
wh = -wh;
float3 r_wi = (wi.z < 0.0f)? -wi: wi;
- return fresnel * max(0.0f, dot(r_wi, wh)) * D_ggx(wh, alpha) / ((1.0f + mf_lambda(r_wi, make_float2(alpha, alpha))) * r_wi.z) + fabsf(wo.z);
+ float lambda = mf_lambda(r_wi, make_float2(alpha, alpha));
+ float D = D_ggx(wh, alpha);
+ float fresnel = fresnel_dielectric_cos(dot(r_wi, wh), eta);
+
+ float multiscatter = fabsf(wo.z * M_1_PI_F);
+ if(reflective) {
+ float singlescatter = 0.25f * D / max((1.0f + lambda) * r_wi.z, 1e-7f);
+ float albedo = mf_ggx_albedo(alpha);
+ return fresnel * (albedo*singlescatter + (1.0f - albedo)*multiscatter);
+ }
+ else {
+ float singlescatter = fabsf(dot(r_wi, wh)*dot(wo, wh) * D * eta*eta / max((1.0f + lambda) * r_wi.z * wh_len*wh_len, 1e-7f));
+ float albedo = mf_ggx_transmission_albedo(alpha, eta);
+ return (1.0f - fresnel) * (albedo*singlescatter + (1.0f - albedo)*multiscatter);
+ }
}
/* === Actual random walk implementations, one version of mf_eval and mf_sample per phase function. === */
@@ -315,13 +320,6 @@ ccl_device_forceinline float mf_glass_pdf(const float3 wi, const float3 wo, cons
#define MF_MULTI_GLASS
#include "kernel/closure/bsdf_microfacet_multi_impl.h"
-/* The diffuse phase function is not implemented as a node yet. */
-#if 0
-#define MF_PHASE_FUNCTION diffuse
-#define MF_MULTI_DIFFUSE
-#include "kernel/closure/bsdf_microfacet_multi_impl.h"
-#endif
-
#define MF_PHASE_FUNCTION glossy
#define MF_MULTI_GLOSSY
#include "kernel/closure/bsdf_microfacet_multi_impl.h"
@@ -345,8 +343,9 @@ ccl_device int bsdf_microfacet_multi_ggx_common_setup(MicrofacetBsdf *bsdf)
bsdf->extra->color.x = saturate(bsdf->extra->color.x);
bsdf->extra->color.y = saturate(bsdf->extra->color.y);
bsdf->extra->color.z = saturate(bsdf->extra->color.z);
-
- bsdf->type = CLOSURE_BSDF_MICROFACET_MULTI_GGX_ID;
+ bsdf->extra->cspec0.x = saturate(bsdf->extra->cspec0.x);
+ bsdf->extra->cspec0.y = saturate(bsdf->extra->cspec0.y);
+ bsdf->extra->cspec0.z = saturate(bsdf->extra->cspec0.z);
return SD_BSDF|SD_BSDF_HAS_EVAL|SD_BSDF_NEEDS_LCG;
}
@@ -356,6 +355,22 @@ ccl_device int bsdf_microfacet_multi_ggx_aniso_setup(MicrofacetBsdf *bsdf)
if(is_zero(bsdf->T))
bsdf->T = make_float3(1.0f, 0.0f, 0.0f);
+ bsdf->type = CLOSURE_BSDF_MICROFACET_MULTI_GGX_ID;
+
+ return bsdf_microfacet_multi_ggx_common_setup(bsdf);
+}
+
+ccl_device int bsdf_microfacet_multi_ggx_aniso_fresnel_setup(MicrofacetBsdf *bsdf, const ShaderData *sd)
+{
+ if(is_zero(bsdf->T))
+ bsdf->T = make_float3(1.0f, 0.0f, 0.0f);
+
+ bsdf->type = CLOSURE_BSDF_MICROFACET_MULTI_GGX_FRESNEL_ID;
+
+ float F0 = fresnel_dielectric_cos(1.0f, bsdf->ior);
+ float F = average(interpolate_fresnel_color(sd->I, bsdf->N, bsdf->ior, F0, bsdf->extra->cspec0));
+ bsdf->sample_weight *= F;
+
return bsdf_microfacet_multi_ggx_common_setup(bsdf);
}
@@ -363,6 +378,30 @@ ccl_device int bsdf_microfacet_multi_ggx_setup(MicrofacetBsdf *bsdf)
{
bsdf->alpha_y = bsdf->alpha_x;
+ bsdf->type = CLOSURE_BSDF_MICROFACET_MULTI_GGX_ID;
+
+ return bsdf_microfacet_multi_ggx_common_setup(bsdf);
+}
+
+ccl_device int bsdf_microfacet_multi_ggx_fresnel_setup(MicrofacetBsdf *bsdf, const ShaderData *sd)
+{
+ bsdf->alpha_y = bsdf->alpha_x;
+
+ bsdf->type = CLOSURE_BSDF_MICROFACET_MULTI_GGX_FRESNEL_ID;
+
+ float F0 = fresnel_dielectric_cos(1.0f, bsdf->ior);
+ float F = average(interpolate_fresnel_color(sd->I, bsdf->N, bsdf->ior, F0, bsdf->extra->cspec0));
+ bsdf->sample_weight *= F;
+
+ return bsdf_microfacet_multi_ggx_common_setup(bsdf);
+}
+
+ccl_device int bsdf_microfacet_multi_ggx_refraction_setup(MicrofacetBsdf *bsdf)
+{
+ bsdf->alpha_y = bsdf->alpha_x;
+
+ bsdf->type = CLOSURE_BSDF_MICROFACET_MULTI_GGX_ID;
+
return bsdf_microfacet_multi_ggx_common_setup(bsdf);
}
@@ -378,6 +417,8 @@ ccl_device float3 bsdf_microfacet_multi_ggx_eval_reflect(const ShaderClosure *sc
return make_float3(0.0f, 0.0f, 0.0f);
}
+ bool use_fresnel = (bsdf->type == CLOSURE_BSDF_MICROFACET_MULTI_GGX_FRESNEL_ID);
+
bool is_aniso = (bsdf->alpha_x != bsdf->alpha_y);
float3 X, Y, Z;
Z = bsdf->N;
@@ -393,7 +434,7 @@ ccl_device float3 bsdf_microfacet_multi_ggx_eval_reflect(const ShaderClosure *sc
*pdf = mf_ggx_aniso_pdf(localI, localO, make_float2(bsdf->alpha_x, bsdf->alpha_y));
else
*pdf = mf_ggx_pdf(localI, localO, bsdf->alpha_x);
- return mf_eval_glossy(localI, localO, true, bsdf->extra->color, bsdf->alpha_x, bsdf->alpha_y, lcg_state, NULL, NULL);
+ return mf_eval_glossy(localI, localO, true, bsdf->extra->color, bsdf->alpha_x, bsdf->alpha_y, lcg_state, bsdf->ior, use_fresnel, bsdf->extra->cspec0);
}
ccl_device int bsdf_microfacet_multi_ggx_sample(KernelGlobals *kg, const ShaderClosure *sc, float3 Ng, float3 I, float3 dIdx, float3 dIdy, float randu, float randv, float3 *eval, float3 *omega_in, float3 *domega_in_dx, float3 *domega_in_dy, float *pdf, ccl_addr_space uint *lcg_state)
@@ -407,9 +448,15 @@ ccl_device int bsdf_microfacet_multi_ggx_sample(KernelGlobals *kg, const ShaderC
*omega_in = 2*dot(Z, I)*Z - I;
*pdf = 1e6f;
*eval = make_float3(1e6f, 1e6f, 1e6f);
+#ifdef __RAY_DIFFERENTIALS__
+ *domega_in_dx = (2 * dot(Z, dIdx)) * Z - dIdx;
+ *domega_in_dy = (2 * dot(Z, dIdy)) * Z - dIdy;
+#endif
return LABEL_REFLECT|LABEL_SINGULAR;
}
+ bool use_fresnel = (bsdf->type == CLOSURE_BSDF_MICROFACET_MULTI_GGX_FRESNEL_ID);
+
bool is_aniso = (bsdf->alpha_x != bsdf->alpha_y);
if(is_aniso)
make_orthonormals_tangent(Z, bsdf->T, &X, &Y);
@@ -419,7 +466,7 @@ ccl_device int bsdf_microfacet_multi_ggx_sample(KernelGlobals *kg, const ShaderC
float3 localI = make_float3(dot(I, X), dot(I, Y), dot(I, Z));
float3 localO;
- *eval = mf_sample_glossy(localI, &localO, bsdf->extra->color, bsdf->alpha_x, bsdf->alpha_y, lcg_state, NULL, NULL);
+ *eval = mf_sample_glossy(localI, &localO, bsdf->extra->color, bsdf->alpha_x, bsdf->alpha_y, lcg_state, bsdf->ior, use_fresnel, bsdf->extra->cspec0);
if(is_aniso)
*pdf = mf_ggx_aniso_pdf(localI, localO, make_float2(bsdf->alpha_x, bsdf->alpha_y));
else
@@ -427,6 +474,7 @@ ccl_device int bsdf_microfacet_multi_ggx_sample(KernelGlobals *kg, const ShaderC
*eval *= *pdf;
*omega_in = X*localO.x + Y*localO.y + Z*localO.z;
+
#ifdef __RAY_DIFFERENTIALS__
*domega_in_dx = (2 * dot(Z, dIdx)) * Z - dIdx;
*domega_in_dy = (2 * dot(Z, dIdy)) * Z - dIdy;
@@ -450,6 +498,27 @@ ccl_device int bsdf_microfacet_multi_ggx_glass_setup(MicrofacetBsdf *bsdf)
return SD_BSDF|SD_BSDF_HAS_EVAL|SD_BSDF_NEEDS_LCG;
}
+ccl_device int bsdf_microfacet_multi_ggx_glass_fresnel_setup(MicrofacetBsdf *bsdf, const ShaderData *sd)
+{
+ bsdf->alpha_x = clamp(bsdf->alpha_x, 1e-4f, 1.0f);
+ bsdf->alpha_y = bsdf->alpha_x;
+ bsdf->ior = max(0.0f, bsdf->ior);
+ bsdf->extra->color.x = saturate(bsdf->extra->color.x);
+ bsdf->extra->color.y = saturate(bsdf->extra->color.y);
+ bsdf->extra->color.z = saturate(bsdf->extra->color.z);
+ bsdf->extra->cspec0.x = saturate(bsdf->extra->cspec0.x);
+ bsdf->extra->cspec0.y = saturate(bsdf->extra->cspec0.y);
+ bsdf->extra->cspec0.z = saturate(bsdf->extra->cspec0.z);
+
+ bsdf->type = CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_FRESNEL_ID;
+
+ float F0 = fresnel_dielectric_cos(1.0f, bsdf->ior);
+ float F = average(interpolate_fresnel_color(sd->I, bsdf->N, bsdf->ior, F0, bsdf->extra->cspec0));
+ bsdf->sample_weight *= F;
+
+ return SD_BSDF|SD_BSDF_HAS_EVAL|SD_BSDF_NEEDS_LCG;
+}
+
ccl_device float3 bsdf_microfacet_multi_ggx_glass_eval_transmit(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf, ccl_addr_space uint *lcg_state) {
const MicrofacetBsdf *bsdf = (const MicrofacetBsdf*)sc;
@@ -465,7 +534,7 @@ ccl_device float3 bsdf_microfacet_multi_ggx_glass_eval_transmit(const ShaderClos
float3 localO = make_float3(dot(omega_in, X), dot(omega_in, Y), dot(omega_in, Z));
*pdf = mf_glass_pdf(localI, localO, bsdf->alpha_x, bsdf->ior);
- return mf_eval_glass(localI, localO, false, bsdf->extra->color, bsdf->alpha_x, bsdf->alpha_y, lcg_state, bsdf->ior);
+ return mf_eval_glass(localI, localO, false, bsdf->extra->color, bsdf->alpha_x, bsdf->alpha_y, lcg_state, bsdf->ior, false, bsdf->extra->color);
}
ccl_device float3 bsdf_microfacet_multi_ggx_glass_eval_reflect(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf, ccl_addr_space uint *lcg_state) {
@@ -475,6 +544,8 @@ ccl_device float3 bsdf_microfacet_multi_ggx_glass_eval_reflect(const ShaderClosu
return make_float3(0.0f, 0.0f, 0.0f);
}
+ bool use_fresnel = (bsdf->type == CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_FRESNEL_ID);
+
float3 X, Y, Z;
Z = bsdf->N;
make_orthonormals(Z, &X, &Y);
@@ -483,7 +554,7 @@ ccl_device float3 bsdf_microfacet_multi_ggx_glass_eval_reflect(const ShaderClosu
float3 localO = make_float3(dot(omega_in, X), dot(omega_in, Y), dot(omega_in, Z));
*pdf = mf_glass_pdf(localI, localO, bsdf->alpha_x, bsdf->ior);
- return mf_eval_glass(localI, localO, true, bsdf->extra->color, bsdf->alpha_x, bsdf->alpha_y, lcg_state, bsdf->ior);
+ return mf_eval_glass(localI, localO, true, bsdf->extra->color, bsdf->alpha_x, bsdf->alpha_y, lcg_state, bsdf->ior, use_fresnel, bsdf->extra->cspec0);
}
ccl_device int bsdf_microfacet_multi_ggx_glass_sample(KernelGlobals *kg, const ShaderClosure *sc, float3 Ng, float3 I, float3 dIdx, float3 dIdy, float randu, float randv, float3 *eval, float3 *omega_in, float3 *domega_in_dx, float3 *domega_in_dy, float *pdf, ccl_addr_space uint *lcg_state)
@@ -525,12 +596,14 @@ ccl_device int bsdf_microfacet_multi_ggx_glass_sample(KernelGlobals *kg, const S
}
}
+ bool use_fresnel = (bsdf->type == CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_FRESNEL_ID);
+
make_orthonormals(Z, &X, &Y);
float3 localI = make_float3(dot(I, X), dot(I, Y), dot(I, Z));
float3 localO;
- *eval = mf_sample_glass(localI, &localO, bsdf->extra->color, bsdf->alpha_x, bsdf->alpha_y, lcg_state, bsdf->ior);
+ *eval = mf_sample_glass(localI, &localO, bsdf->extra->color, bsdf->alpha_x, bsdf->alpha_y, lcg_state, bsdf->ior, use_fresnel, bsdf->extra->cspec0);
*pdf = mf_glass_pdf(localI, localO, bsdf->alpha_x, bsdf->ior);
*eval *= *pdf;
diff --git a/intern/cycles/kernel/closure/bsdf_microfacet_multi_impl.h b/intern/cycles/kernel/closure/bsdf_microfacet_multi_impl.h
index 8054fa8e849..e73915dbda7 100644
--- a/intern/cycles/kernel/closure/bsdf_microfacet_multi_impl.h
+++ b/intern/cycles/kernel/closure/bsdf_microfacet_multi_impl.h
@@ -26,19 +26,16 @@
* the balance heuristic isn't necessarily optimal anymore.
*/
ccl_device_forceinline float3 MF_FUNCTION_FULL_NAME(mf_eval)(
- float3 wi,
- float3 wo,
- const bool wo_outside,
- const float3 color,
- const float alpha_x,
- const float alpha_y,
- ccl_addr_space uint *lcg_state
-#ifdef MF_MULTI_GLASS
- , const float eta
-#elif defined(MF_MULTI_GLOSSY)
- , float3 *n, float3 *k
-#endif
-)
+ float3 wi,
+ float3 wo,
+ const bool wo_outside,
+ const float3 color,
+ const float alpha_x,
+ const float alpha_y,
+ ccl_addr_space uint *lcg_state,
+ const float eta,
+ bool use_fresnel,
+ const float3 cspec0)
{
/* Evaluating for a shallower incoming direction produces less noise, and the properties of the BSDF guarantee reciprocity. */
bool swapped = false;
@@ -71,50 +68,57 @@ ccl_device_forceinline float3 MF_FUNCTION_FULL_NAME(mf_eval)(
/* Analytically compute single scattering for lower noise. */
float3 eval;
+ float3 throughput = make_float3(1.0f, 1.0f, 1.0f);
+ const float3 wh = normalize(wi+wo);
#ifdef MF_MULTI_GLASS
eval = mf_eval_phase_glass(-wi, lambda_r, wo, wo_outside, alpha, eta);
if(wo_outside)
eval *= -lambda_r / (shadowing_lambda - lambda_r);
else
eval *= -lambda_r * beta(-lambda_r, shadowing_lambda+1.0f);
-#elif defined(MF_MULTI_DIFFUSE)
- /* Diffuse has no special closed form for the single scattering bounce */
- eval = make_float3(0.0f, 0.0f, 0.0f);
#else /* MF_MULTI_GLOSSY */
- const float3 wh = normalize(wi+wo);
const float G2 = 1.0f / (1.0f - (lambda_r + 1.0f) + shadowing_lambda);
float val = G2 * 0.25f / wi.z;
if(alpha.x == alpha.y)
val *= D_ggx(wh, alpha.x);
else
val *= D_ggx_aniso(wh, alpha);
- if(n && k) {
- eval = fresnel_conductor(dot(wh, wi), *n, *k) * val;
- }
- else {
- eval = make_float3(val, val, val);
- }
+ eval = make_float3(val, val, val);
#endif
+ float F0 = fresnel_dielectric_cos(1.0f, eta);
+ if(use_fresnel) {
+ throughput = interpolate_fresnel_color(wi, wh, eta, F0, cspec0);
+
+ eval *= throughput;
+ }
+
float3 wr = -wi;
float hr = 1.0f;
float C1_r = 1.0f;
float G1_r = 0.0f;
bool outside = true;
- float3 throughput = make_float3(1.0f, 1.0f, 1.0f);
for(int order = 0; order < 10; order++) {
- /* Sample microfacet height and normal */
- if(!mf_sample_height(wr, &hr, &C1_r, &G1_r, &lambda_r, lcg_step_float_addrspace(lcg_state)))
+ /* Sample microfacet height. */
+ float height_rand = lcg_step_float_addrspace(lcg_state);
+ if(!mf_sample_height(wr, &hr, &C1_r, &G1_r, &lambda_r, height_rand))
break;
- float3 wm = mf_sample_vndf(-wr, alpha, make_float2(lcg_step_float_addrspace(lcg_state),
- lcg_step_float_addrspace(lcg_state)));
-
-#ifdef MF_MULTI_DIFFUSE
- if(order == 0) {
- /* Compute single-scattering for diffuse. */
- const float G2_G1 = -lambda_r / (shadowing_lambda - lambda_r);
- eval += throughput * G2_G1 * mf_eval_phase_diffuse(wo, wm);
+ /* Sample microfacet normal. */
+ float vndf_rand_y = lcg_step_float_addrspace(lcg_state);
+ float vndf_rand_x = lcg_step_float_addrspace(lcg_state);
+ float3 wm = mf_sample_vndf(-wr, alpha, vndf_rand_x, vndf_rand_y);
+
+#ifdef MF_MULTI_GLASS
+ if(order == 0 && use_fresnel) {
+ /* Evaluate amount of scattering towards wo on this microfacet. */
+ float3 phase;
+ if(outside)
+ phase = mf_eval_phase_glass(wr, lambda_r, wo, wo_outside, alpha, eta);
+ else
+ phase = mf_eval_phase_glass(wr, lambda_r, -wo, !wo_outside, alpha, 1.0f / eta);
+
+ eval = throughput * phase * mf_G1(wo_outside ? wo : -wo, mf_C1((outside == wo_outside) ? hr : -hr), shadowing_lambda);
}
#endif
if(order > 0) {
@@ -125,10 +129,8 @@ ccl_device_forceinline float3 MF_FUNCTION_FULL_NAME(mf_eval)(
phase = mf_eval_phase_glass(wr, lambda_r, wo, wo_outside, alpha, eta);
else
phase = mf_eval_phase_glass(wr, lambda_r, -wo, !wo_outside, alpha, 1.0f/eta);
-#elif defined(MF_MULTI_DIFFUSE)
- phase = mf_eval_phase_diffuse(wo, wm);
#else /* MF_MULTI_GLOSSY */
- phase = mf_eval_phase_glossy(wr, lambda_r, wo, alpha, n, k) * throughput;
+ phase = mf_eval_phase_glossy(wr, lambda_r, wo, alpha) * throughput;
#endif
eval += throughput * phase * mf_G1(wo_outside? wo: -wo, mf_C1((outside == wo_outside)? hr: -hr), shadowing_lambda);
}
@@ -136,23 +138,32 @@ ccl_device_forceinline float3 MF_FUNCTION_FULL_NAME(mf_eval)(
/* Bounce from the microfacet. */
#ifdef MF_MULTI_GLASS
bool next_outside;
- wr = mf_sample_phase_glass(-wr, outside? eta: 1.0f/eta, wm, lcg_step_float_addrspace(lcg_state), &next_outside);
+ float3 wi_prev = -wr;
+ float phase_rand = lcg_step_float_addrspace(lcg_state);
+ wr = mf_sample_phase_glass(-wr, outside? eta: 1.0f/eta, wm, phase_rand, &next_outside);
if(!next_outside) {
outside = !outside;
wr = -wr;
hr = -hr;
}
-#elif defined(MF_MULTI_DIFFUSE)
- wr = mf_sample_phase_diffuse(wm,
- lcg_step_float_addrspace(lcg_state),
- lcg_step_float_addrspace(lcg_state));
+
+ if(use_fresnel && !next_outside) {
+ throughput *= color;
+ }
+ else if(use_fresnel && order > 0) {
+ throughput *= interpolate_fresnel_color(wi_prev, wm, eta, F0, cspec0);
+ }
#else /* MF_MULTI_GLOSSY */
- wr = mf_sample_phase_glossy(-wr, n, k, &throughput, wm);
+ if(use_fresnel && order > 0) {
+ throughput *= interpolate_fresnel_color(-wr, wm, eta, F0, cspec0);
+ }
+ wr = mf_sample_phase_glossy(-wr, &throughput, wm);
#endif
lambda_r = mf_lambda(wr, alpha);
- throughput *= color;
+ if(!use_fresnel)
+ throughput *= color;
C1_r = mf_C1(hr);
G1_r = mf_G1(wr, C1_r, lambda_r);
@@ -168,13 +179,16 @@ ccl_device_forceinline float3 MF_FUNCTION_FULL_NAME(mf_eval)(
* escaped the surface in wo. The function returns the throughput between wi and wo.
* Without reflection losses due to coloring or fresnel absorption in conductors, the sampling is optimal.
*/
-ccl_device_forceinline float3 MF_FUNCTION_FULL_NAME(mf_sample)(float3 wi, float3 *wo, const float3 color, const float alpha_x, const float alpha_y, ccl_addr_space uint *lcg_state
-#ifdef MF_MULTI_GLASS
- , const float eta
-#elif defined(MF_MULTI_GLOSSY)
- , float3 *n, float3 *k
-#endif
-)
+ccl_device_forceinline float3 MF_FUNCTION_FULL_NAME(mf_sample)(
+ float3 wi,
+ float3 *wo,
+ const float3 color,
+ const float alpha_x,
+ const float alpha_y,
+ ccl_addr_space uint *lcg_state,
+ const float eta,
+ bool use_fresnel,
+ const float3 cspec0)
{
const float2 alpha = make_float2(alpha_x, alpha_y);
@@ -186,37 +200,64 @@ ccl_device_forceinline float3 MF_FUNCTION_FULL_NAME(mf_sample)(float3 wi, float3
float G1_r = 0.0f;
bool outside = true;
+ float F0 = fresnel_dielectric_cos(1.0f, eta);
+ if(use_fresnel) {
+ throughput = interpolate_fresnel_color(wi, normalize(wi + wr), eta, F0, cspec0);
+ }
+
int order;
for(order = 0; order < 10; order++) {
/* Sample microfacet height. */
- if(!mf_sample_height(wr, &hr, &C1_r, &G1_r, &lambda_r, lcg_step_float_addrspace(lcg_state))) {
+ float height_rand = lcg_step_float_addrspace(lcg_state);
+ if(!mf_sample_height(wr, &hr, &C1_r, &G1_r, &lambda_r, height_rand)) {
/* The random walk has left the surface. */
*wo = outside? wr: -wr;
return throughput;
}
/* Sample microfacet normal. */
- float3 wm = mf_sample_vndf(-wr, alpha, make_float2(lcg_step_float_addrspace(lcg_state),
- lcg_step_float_addrspace(lcg_state)));
+ float vndf_rand_y = lcg_step_float_addrspace(lcg_state);
+ float vndf_rand_x = lcg_step_float_addrspace(lcg_state);
+ float3 wm = mf_sample_vndf(-wr, alpha, vndf_rand_x, vndf_rand_y);
/* First-bounce color is already accounted for in mix weight. */
- if(order > 0)
+ if(!use_fresnel && order > 0)
throughput *= color;
/* Bounce from the microfacet. */
#ifdef MF_MULTI_GLASS
bool next_outside;
- wr = mf_sample_phase_glass(-wr, outside? eta: 1.0f/eta, wm, lcg_step_float_addrspace(lcg_state), &next_outside);
+ float3 wi_prev = -wr;
+ float phase_rand = lcg_step_float_addrspace(lcg_state);
+ wr = mf_sample_phase_glass(-wr, outside? eta: 1.0f/eta, wm, phase_rand, &next_outside);
if(!next_outside) {
hr = -hr;
wr = -wr;
outside = !outside;
}
-#elif defined(MF_MULTI_DIFFUSE)
- wr = mf_sample_phase_diffuse(wm,
- lcg_step_float_addrspace(lcg_state),
- lcg_step_float_addrspace(lcg_state));
+
+ if(use_fresnel) {
+ if(!next_outside) {
+ throughput *= color;
+ }
+ else {
+ float3 t_color = interpolate_fresnel_color(wi_prev, wm, eta, F0, cspec0);
+
+ if(order == 0)
+ throughput = t_color;
+ else
+ throughput *= t_color;
+ }
+ }
#else /* MF_MULTI_GLOSSY */
- wr = mf_sample_phase_glossy(-wr, n, k, &throughput, wm);
+ if(use_fresnel) {
+ float3 t_color = interpolate_fresnel_color(-wr, wm, eta, F0, cspec0);
+
+ if(order == 0)
+ throughput = t_color;
+ else
+ throughput *= t_color;
+ }
+ wr = mf_sample_phase_glossy(-wr, &throughput, wm);
#endif
/* Update random walk parameters. */
@@ -228,6 +269,5 @@ ccl_device_forceinline float3 MF_FUNCTION_FULL_NAME(mf_sample)(float3 wi, float3
}
#undef MF_MULTI_GLASS
-#undef MF_MULTI_DIFFUSE
#undef MF_MULTI_GLOSSY
#undef MF_PHASE_FUNCTION
diff --git a/intern/cycles/kernel/closure/bsdf_oren_nayar.h b/intern/cycles/kernel/closure/bsdf_oren_nayar.h
index cb342a026ef..6b770fc0c16 100644
--- a/intern/cycles/kernel/closure/bsdf_oren_nayar.h
+++ b/intern/cycles/kernel/closure/bsdf_oren_nayar.h
@@ -22,7 +22,6 @@ CCL_NAMESPACE_BEGIN
typedef ccl_addr_space struct OrenNayarBsdf {
SHADER_CLOSURE_BASE;
- float3 N;
float roughness;
float a;
float b;
diff --git a/intern/cycles/kernel/closure/bsdf_phong_ramp.h b/intern/cycles/kernel/closure/bsdf_phong_ramp.h
index e152a8780db..420f94755ee 100644
--- a/intern/cycles/kernel/closure/bsdf_phong_ramp.h
+++ b/intern/cycles/kernel/closure/bsdf_phong_ramp.h
@@ -40,7 +40,6 @@ CCL_NAMESPACE_BEGIN
typedef ccl_addr_space struct PhongRampBsdf {
SHADER_CLOSURE_BASE;
- float3 N;
float exponent;
float3 *colors;
} PhongRampBsdf;
diff --git a/intern/cycles/kernel/closure/bsdf_principled_diffuse.h b/intern/cycles/kernel/closure/bsdf_principled_diffuse.h
new file mode 100644
index 00000000000..f8ca64293b0
--- /dev/null
+++ b/intern/cycles/kernel/closure/bsdf_principled_diffuse.h
@@ -0,0 +1,127 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __BSDF_PRINCIPLED_DIFFUSE_H__
+#define __BSDF_PRINCIPLED_DIFFUSE_H__
+
+/* DISNEY PRINCIPLED DIFFUSE BRDF
+ *
+ * Shading model by Brent Burley (Disney): "Physically Based Shading at Disney" (2012)
+ */
+
+CCL_NAMESPACE_BEGIN
+
+typedef ccl_addr_space struct PrincipledDiffuseBsdf {
+ SHADER_CLOSURE_BASE;
+
+ float roughness;
+} PrincipledDiffuseBsdf;
+
+ccl_device float3 calculate_principled_diffuse_brdf(const PrincipledDiffuseBsdf *bsdf,
+ float3 N, float3 V, float3 L, float3 H, float *pdf)
+{
+ float NdotL = max(dot(N, L), 0.0f);
+ float NdotV = max(dot(N, V), 0.0f);
+
+ if(NdotL < 0 || NdotV < 0) {
+ *pdf = 0.0f;
+ return make_float3(0.0f, 0.0f, 0.0f);
+ }
+
+ float LdotH = dot(L, H);
+
+ float FL = schlick_fresnel(NdotL), FV = schlick_fresnel(NdotV);
+ const float Fd90 = 0.5f + 2.0f * LdotH*LdotH * bsdf->roughness;
+ float Fd = (1.0f * (1.0f - FL) + Fd90 * FL) * (1.0f * (1.0f - FV) + Fd90 * FV);
+
+ float value = M_1_PI_F * NdotL * Fd;
+
+ return make_float3(value, value, value);
+}
+
+ccl_device int bsdf_principled_diffuse_setup(PrincipledDiffuseBsdf *bsdf)
+{
+ bsdf->type = CLOSURE_BSDF_PRINCIPLED_DIFFUSE_ID;
+ return SD_BSDF|SD_BSDF_HAS_EVAL;
+}
+
+ccl_device bool bsdf_principled_diffuse_merge(const ShaderClosure *a, const ShaderClosure *b)
+{
+ const PrincipledDiffuseBsdf *bsdf_a = (const PrincipledDiffuseBsdf*)a;
+ const PrincipledDiffuseBsdf *bsdf_b = (const PrincipledDiffuseBsdf*)b;
+
+ return (isequal_float3(bsdf_a->N, bsdf_b->N) && bsdf_a->roughness == bsdf_b->roughness);
+}
+
+ccl_device float3 bsdf_principled_diffuse_eval_reflect(const ShaderClosure *sc, const float3 I,
+ const float3 omega_in, float *pdf)
+{
+ const PrincipledDiffuseBsdf *bsdf = (const PrincipledDiffuseBsdf *)sc;
+
+ float3 N = bsdf->N;
+ float3 V = I; // outgoing
+ float3 L = omega_in; // incoming
+ float3 H = normalize(L + V);
+
+ if(dot(N, omega_in) > 0.0f) {
+ *pdf = fmaxf(dot(N, omega_in), 0.0f) * M_1_PI_F;
+ return calculate_principled_diffuse_brdf(bsdf, N, V, L, H, pdf);
+ }
+ else {
+ *pdf = 0.0f;
+ return make_float3(0.0f, 0.0f, 0.0f);
+ }
+}
+
+ccl_device float3 bsdf_principled_diffuse_eval_transmit(const ShaderClosure *sc, const float3 I,
+ const float3 omega_in, float *pdf)
+{
+ return make_float3(0.0f, 0.0f, 0.0f);
+}
+
+ccl_device int bsdf_principled_diffuse_sample(const ShaderClosure *sc,
+ float3 Ng, float3 I, float3 dIdx, float3 dIdy, float randu, float randv,
+ float3 *eval, float3 *omega_in, float3 *domega_in_dx,
+ float3 *domega_in_dy, float *pdf)
+{
+ const PrincipledDiffuseBsdf *bsdf = (const PrincipledDiffuseBsdf *)sc;
+
+ float3 N = bsdf->N;
+
+ sample_cos_hemisphere(N, randu, randv, omega_in, pdf);
+
+ if(dot(Ng, *omega_in) > 0) {
+ float3 H = normalize(I + *omega_in);
+
+ *eval = calculate_principled_diffuse_brdf(bsdf, N, I, *omega_in, H, pdf);
+
+#ifdef __RAY_DIFFERENTIALS__
+ // TODO: find a better approximation for the diffuse bounce
+ *domega_in_dx = -((2 * dot(N, dIdx)) * N - dIdx);
+ *domega_in_dy = -((2 * dot(N, dIdy)) * N - dIdy);
+#endif
+ }
+ else {
+ *pdf = 0.0f;
+ }
+ return LABEL_REFLECT|LABEL_DIFFUSE;
+}
+
+CCL_NAMESPACE_END
+
+#endif /* __BSDF_PRINCIPLED_DIFFUSE_H__ */
+
+
diff --git a/intern/cycles/kernel/closure/bsdf_principled_sheen.h b/intern/cycles/kernel/closure/bsdf_principled_sheen.h
new file mode 100644
index 00000000000..f4476bfecd0
--- /dev/null
+++ b/intern/cycles/kernel/closure/bsdf_principled_sheen.h
@@ -0,0 +1,113 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __BSDF_PRINCIPLED_SHEEN_H__
+#define __BSDF_PRINCIPLED_SHEEN_H__
+
+/* DISNEY PRINCIPLED SHEEN BRDF
+ *
+ * Shading model by Brent Burley (Disney): "Physically Based Shading at Disney" (2012)
+ */
+
+CCL_NAMESPACE_BEGIN
+
+typedef ccl_addr_space struct PrincipledSheenBsdf {
+ SHADER_CLOSURE_BASE;
+} PrincipledSheenBsdf;
+
+ccl_device float3 calculate_principled_sheen_brdf(const PrincipledSheenBsdf *bsdf,
+ float3 N, float3 V, float3 L, float3 H, float *pdf)
+{
+ float NdotL = dot(N, L);
+ float NdotV = dot(N, V);
+
+ if(NdotL < 0 || NdotV < 0) {
+ *pdf = 0.0f;
+ return make_float3(0.0f, 0.0f, 0.0f);
+ }
+
+ float LdotH = dot(L, H);
+
+ float value = schlick_fresnel(LdotH) * NdotL;
+
+ return make_float3(value, value, value);
+}
+
+ccl_device int bsdf_principled_sheen_setup(PrincipledSheenBsdf *bsdf)
+{
+ bsdf->type = CLOSURE_BSDF_PRINCIPLED_SHEEN_ID;
+ return SD_BSDF|SD_BSDF_HAS_EVAL;
+}
+
+ccl_device float3 bsdf_principled_sheen_eval_reflect(const ShaderClosure *sc, const float3 I,
+ const float3 omega_in, float *pdf)
+{
+ const PrincipledSheenBsdf *bsdf = (const PrincipledSheenBsdf *)sc;
+
+ float3 N = bsdf->N;
+ float3 V = I; // outgoing
+ float3 L = omega_in; // incoming
+ float3 H = normalize(L + V);
+
+ if(dot(N, omega_in) > 0.0f) {
+ *pdf = fmaxf(dot(N, omega_in), 0.0f) * M_1_PI_F;
+ return calculate_principled_sheen_brdf(bsdf, N, V, L, H, pdf);
+ }
+ else {
+ *pdf = 0.0f;
+ return make_float3(0.0f, 0.0f, 0.0f);
+ }
+}
+
+ccl_device float3 bsdf_principled_sheen_eval_transmit(const ShaderClosure *sc, const float3 I,
+ const float3 omega_in, float *pdf)
+{
+ return make_float3(0.0f, 0.0f, 0.0f);
+}
+
+ccl_device int bsdf_principled_sheen_sample(const ShaderClosure *sc,
+ float3 Ng, float3 I, float3 dIdx, float3 dIdy, float randu, float randv,
+ float3 *eval, float3 *omega_in, float3 *domega_in_dx,
+ float3 *domega_in_dy, float *pdf)
+{
+ const PrincipledSheenBsdf *bsdf = (const PrincipledSheenBsdf *)sc;
+
+ float3 N = bsdf->N;
+
+ sample_cos_hemisphere(N, randu, randv, omega_in, pdf);
+
+ if(dot(Ng, *omega_in) > 0) {
+ float3 H = normalize(I + *omega_in);
+
+ *eval = calculate_principled_sheen_brdf(bsdf, N, I, *omega_in, H, pdf);
+
+#ifdef __RAY_DIFFERENTIALS__
+ // TODO: find a better approximation for the diffuse bounce
+ *domega_in_dx = -((2 * dot(N, dIdx)) * N - dIdx);
+ *domega_in_dy = -((2 * dot(N, dIdy)) * N - dIdy);
+#endif
+ }
+ else {
+ *pdf = 0.0f;
+ }
+ return LABEL_REFLECT|LABEL_DIFFUSE;
+}
+
+CCL_NAMESPACE_END
+
+#endif /* __BSDF_PRINCIPLED_SHEEN_H__ */
+
+
diff --git a/intern/cycles/kernel/closure/bsdf_toon.h b/intern/cycles/kernel/closure/bsdf_toon.h
index 28e775bcbc8..d8b6d8ddead 100644
--- a/intern/cycles/kernel/closure/bsdf_toon.h
+++ b/intern/cycles/kernel/closure/bsdf_toon.h
@@ -38,7 +38,6 @@ CCL_NAMESPACE_BEGIN
typedef ccl_addr_space struct ToonBsdf {
SHADER_CLOSURE_BASE;
- float3 N;
float size;
float smooth;
} ToonBsdf;
diff --git a/intern/cycles/kernel/closure/bsdf_util.h b/intern/cycles/kernel/closure/bsdf_util.h
index b0c5280b6cb..3dc15d5791c 100644
--- a/intern/cycles/kernel/closure/bsdf_util.h
+++ b/intern/cycles/kernel/closure/bsdf_util.h
@@ -124,6 +124,13 @@ ccl_device float3 fresnel_conductor(float cosi, const float3 eta, const float3 k
return(Rparl2 + Rperp2) * 0.5f;
}
+ccl_device float schlick_fresnel(float u)
+{
+ float m = clamp(1.0f - u, 0.0f, 1.0f);
+ float m2 = m * m;
+ return m2 * m2 * m; // pow(m, 5)
+}
+
ccl_device float smooth_step(float edge0, float edge1, float x)
{
float result;
@@ -136,6 +143,19 @@ ccl_device float smooth_step(float edge0, float edge1, float x)
return result;
}
+/* Calculate the fresnel color which is a blend between white and the F0 color (cspec0) */
+ccl_device_forceinline float3 interpolate_fresnel_color(float3 L, float3 H, float ior, float F0, float3 cspec0) {
+ /* Calculate the fresnel interpolation factor
+ * The value from fresnel_dielectric_cos(...) has to be normalized because
+ * the cspec0 keeps the F0 color
+ */
+ float F0_norm = 1.0f / (1.0f - F0);
+ float FH = (fresnel_dielectric_cos(dot(L, H), ior) - F0) * F0_norm;
+
+ /* Blend between white and a specular color with respect to the fresnel */
+ return cspec0 * (1.0f - FH) + make_float3(1.0f, 1.0f, 1.0f) * FH;
+}
+
CCL_NAMESPACE_END
#endif /* __BSDF_UTIL_H__ */
diff --git a/intern/cycles/kernel/closure/bssrdf.h b/intern/cycles/kernel/closure/bssrdf.h
index af0bbd861a9..f733ea4c517 100644
--- a/intern/cycles/kernel/closure/bssrdf.h
+++ b/intern/cycles/kernel/closure/bssrdf.h
@@ -27,7 +27,7 @@ typedef ccl_addr_space struct Bssrdf {
float d;
float texture_blur;
float albedo;
- float3 N;
+ float roughness;
} Bssrdf;
/* Planar Truncated Gaussian
@@ -360,10 +360,32 @@ ccl_device int bssrdf_setup(Bssrdf *bssrdf, ClosureType type)
{
if(bssrdf->radius < BSSRDF_MIN_RADIUS) {
/* revert to diffuse BSDF if radius too small */
- DiffuseBsdf *bsdf = (DiffuseBsdf*)bssrdf;
- bsdf->N = bssrdf->N;
- int flag = bsdf_diffuse_setup(bsdf);
- bsdf->type = CLOSURE_BSDF_BSSRDF_ID;
+ int flag;
+#ifdef __PRINCIPLED__
+ if(type == CLOSURE_BSSRDF_PRINCIPLED_ID) {
+ float roughness = bssrdf->roughness;
+ float3 N = bssrdf->N;
+ float3 weight = bssrdf->weight;
+ float sample_weight = bssrdf->sample_weight;
+
+ PrincipledDiffuseBsdf *bsdf = (PrincipledDiffuseBsdf*)bssrdf;
+
+ bsdf->N = N;
+ bsdf->roughness = roughness;
+ bsdf->weight = weight;
+ bsdf->sample_weight = sample_weight;
+ flag = bsdf_principled_diffuse_setup(bsdf);
+ bsdf->type = CLOSURE_BSDF_BSSRDF_PRINCIPLED_ID;
+ }
+ else
+#endif /* __PRINCIPLED__ */
+ {
+ DiffuseBsdf *bsdf = (DiffuseBsdf*)bssrdf;
+ bsdf->N = bssrdf->N;
+ flag = bsdf_diffuse_setup(bsdf);
+ bsdf->type = CLOSURE_BSDF_BSSRDF_ID;
+ }
+
return flag;
}
else {
@@ -371,7 +393,9 @@ ccl_device int bssrdf_setup(Bssrdf *bssrdf, ClosureType type)
bssrdf->sharpness = saturate(bssrdf->sharpness);
bssrdf->type = type;
- if(type == CLOSURE_BSSRDF_BURLEY_ID) {
+ if(type == CLOSURE_BSSRDF_BURLEY_ID ||
+ type == CLOSURE_BSSRDF_PRINCIPLED_ID)
+ {
bssrdf_burley_setup(bssrdf);
}
@@ -385,7 +409,7 @@ ccl_device void bssrdf_sample(const ShaderClosure *sc, float xi, float *r, float
bssrdf_cubic_sample(sc, xi, r, h);
else if(sc->type == CLOSURE_BSSRDF_GAUSSIAN_ID)
bssrdf_gaussian_sample(sc, xi, r, h);
- else /*if(sc->type == CLOSURE_BSSRDF_BURLEY_ID)*/
+ else /*if(sc->type == CLOSURE_BSSRDF_BURLEY_ID || sc->type == CLOSURE_BSSRDF_PRINCIPLED_ID)*/
bssrdf_burley_sample(sc, xi, r, h);
}
@@ -395,7 +419,7 @@ ccl_device_forceinline float bssrdf_pdf(const ShaderClosure *sc, float r)
return bssrdf_cubic_pdf(sc, r);
else if(sc->type == CLOSURE_BSSRDF_GAUSSIAN_ID)
return bssrdf_gaussian_pdf(sc, r);
- else /*if(sc->type == CLOSURE_BSSRDF_BURLEY_ID)*/
+ else /*if(sc->type == CLOSURE_BSSRDF_BURLEY_ID || sc->type == CLOSURE_BSSRDF_PRINCIPLED_ID)*/
return bssrdf_burley_pdf(sc, r);
}
diff --git a/intern/cycles/kernel/filter/filter.h b/intern/cycles/kernel/filter/filter.h
new file mode 100644
index 00000000000..f6e474d6702
--- /dev/null
+++ b/intern/cycles/kernel/filter/filter.h
@@ -0,0 +1,52 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __FILTER_H__
+#define __FILTER_H__
+
+/* CPU Filter Kernel Interface */
+
+#include "util/util_types.h"
+
+#include "kernel/filter/filter_defines.h"
+
+CCL_NAMESPACE_BEGIN
+
+#define KERNEL_NAME_JOIN(x, y, z) x ## _ ## y ## _ ## z
+#define KERNEL_NAME_EVAL(arch, name) KERNEL_NAME_JOIN(kernel, arch, name)
+#define KERNEL_FUNCTION_FULL_NAME(name) KERNEL_NAME_EVAL(KERNEL_ARCH, name)
+
+#define KERNEL_ARCH cpu
+#include "kernel/kernels/cpu/filter_cpu.h"
+
+#define KERNEL_ARCH cpu_sse2
+#include "kernel/kernels/cpu/filter_cpu.h"
+
+#define KERNEL_ARCH cpu_sse3
+#include "kernel/kernels/cpu/filter_cpu.h"
+
+#define KERNEL_ARCH cpu_sse41
+#include "kernel/kernels/cpu/filter_cpu.h"
+
+#define KERNEL_ARCH cpu_avx
+#include "kernel/kernels/cpu/filter_cpu.h"
+
+#define KERNEL_ARCH cpu_avx2
+#include "kernel/kernels/cpu/filter_cpu.h"
+
+CCL_NAMESPACE_END
+
+#endif /* __FILTER_H__ */
diff --git a/intern/cycles/kernel/filter/filter_defines.h b/intern/cycles/kernel/filter/filter_defines.h
new file mode 100644
index 00000000000..ce96f733aff
--- /dev/null
+++ b/intern/cycles/kernel/filter/filter_defines.h
@@ -0,0 +1,38 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __FILTER_DEFINES_H__
+#define __FILTER_DEFINES_H__
+
+#define DENOISE_FEATURES 10
+#define TRANSFORM_SIZE (DENOISE_FEATURES*DENOISE_FEATURES)
+#define XTWX_SIZE (((DENOISE_FEATURES+1)*(DENOISE_FEATURES+2))/2)
+#define XTWY_SIZE (DENOISE_FEATURES+1)
+
+typedef struct TilesInfo {
+ int offsets[9];
+ int strides[9];
+ int x[4];
+ int y[4];
+ /* TODO(lukas): CUDA doesn't have uint64_t... */
+#ifdef __KERNEL_OPENCL__
+ ccl_global float *buffers[9];
+#else
+ long long int buffers[9];
+#endif
+} TilesInfo;
+
+#endif /* __FILTER_DEFINES_H__*/
diff --git a/intern/cycles/kernel/filter/filter_features.h b/intern/cycles/kernel/filter/filter_features.h
new file mode 100644
index 00000000000..6226ed2c2ef
--- /dev/null
+++ b/intern/cycles/kernel/filter/filter_features.h
@@ -0,0 +1,124 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ CCL_NAMESPACE_BEGIN
+
+#define ccl_get_feature(buffer, pass) (buffer)[(pass)*pass_stride]
+
+/* Loop over the pixels in the range [low.x, high.x) x [low.y, high.y).
+ * pixel_buffer always points to the current pixel in the first pass. */
+#define FOR_PIXEL_WINDOW pixel_buffer = buffer + (low.y - rect.y)*buffer_w + (low.x - rect.x); \
+ for(pixel.y = low.y; pixel.y < high.y; pixel.y++) { \
+ for(pixel.x = low.x; pixel.x < high.x; pixel.x++, pixel_buffer++) {
+
+#define END_FOR_PIXEL_WINDOW } \
+ pixel_buffer += buffer_w - (high.x - low.x); \
+ }
+
+ccl_device_inline void filter_get_features(int2 pixel,
+ const ccl_global float *ccl_restrict buffer,
+ float *features,
+ const float *ccl_restrict mean,
+ int pass_stride)
+{
+ features[0] = pixel.x;
+ features[1] = pixel.y;
+ features[2] = fabsf(ccl_get_feature(buffer, 0));
+ features[3] = ccl_get_feature(buffer, 1);
+ features[4] = ccl_get_feature(buffer, 2);
+ features[5] = ccl_get_feature(buffer, 3);
+ features[6] = ccl_get_feature(buffer, 4);
+ features[7] = ccl_get_feature(buffer, 5);
+ features[8] = ccl_get_feature(buffer, 6);
+ features[9] = ccl_get_feature(buffer, 7);
+ if(mean) {
+ for(int i = 0; i < DENOISE_FEATURES; i++)
+ features[i] -= mean[i];
+ }
+}
+
+ccl_device_inline void filter_get_feature_scales(int2 pixel,
+ const ccl_global float *ccl_restrict buffer,
+ float *scales,
+ const float *ccl_restrict mean,
+ int pass_stride)
+{
+ scales[0] = fabsf(pixel.x - mean[0]);
+ scales[1] = fabsf(pixel.y - mean[1]);
+ scales[2] = fabsf(fabsf(ccl_get_feature(buffer, 0)) - mean[2]);
+ scales[3] = len_squared(make_float3(ccl_get_feature(buffer, 1) - mean[3],
+ ccl_get_feature(buffer, 2) - mean[4],
+ ccl_get_feature(buffer, 3) - mean[5]));
+ scales[4] = fabsf(ccl_get_feature(buffer, 4) - mean[6]);
+ scales[5] = len_squared(make_float3(ccl_get_feature(buffer, 5) - mean[7],
+ ccl_get_feature(buffer, 6) - mean[8],
+ ccl_get_feature(buffer, 7) - mean[9]));
+}
+
+ccl_device_inline void filter_calculate_scale(float *scale)
+{
+ scale[0] = 1.0f/max(scale[0], 0.01f);
+ scale[1] = 1.0f/max(scale[1], 0.01f);
+ scale[2] = 1.0f/max(scale[2], 0.01f);
+ scale[6] = 1.0f/max(scale[4], 0.01f);
+ scale[7] = scale[8] = scale[9] = 1.0f/max(sqrtf(scale[5]), 0.01f);
+ scale[3] = scale[4] = scale[5] = 1.0f/max(sqrtf(scale[3]), 0.01f);
+}
+
+ccl_device_inline float3 filter_get_color(const ccl_global float *ccl_restrict buffer,
+ int pass_stride)
+{
+ return make_float3(ccl_get_feature(buffer, 8), ccl_get_feature(buffer, 9), ccl_get_feature(buffer, 10));
+}
+
+ccl_device_inline void design_row_add(float *design_row,
+ int rank,
+ const ccl_global float *ccl_restrict transform,
+ int stride,
+ int row,
+ float feature)
+{
+ for(int i = 0; i < rank; i++) {
+ design_row[1+i] += transform[(row*DENOISE_FEATURES + i)*stride]*feature;
+ }
+}
+
+/* Fill the design row. */
+ccl_device_inline void filter_get_design_row_transform(int2 p_pixel,
+ const ccl_global float *ccl_restrict p_buffer,
+ int2 q_pixel,
+ const ccl_global float *ccl_restrict q_buffer,
+ int pass_stride,
+ int rank,
+ float *design_row,
+ const ccl_global float *ccl_restrict transform,
+ int stride)
+{
+ design_row[0] = 1.0f;
+ math_vector_zero(design_row+1, rank);
+ design_row_add(design_row, rank, transform, stride, 0, q_pixel.x - p_pixel.x);
+ design_row_add(design_row, rank, transform, stride, 1, q_pixel.y - p_pixel.y);
+ design_row_add(design_row, rank, transform, stride, 2, fabsf(ccl_get_feature(q_buffer, 0)) - fabsf(ccl_get_feature(p_buffer, 0)));
+ design_row_add(design_row, rank, transform, stride, 3, ccl_get_feature(q_buffer, 1) - ccl_get_feature(p_buffer, 1));
+ design_row_add(design_row, rank, transform, stride, 4, ccl_get_feature(q_buffer, 2) - ccl_get_feature(p_buffer, 2));
+ design_row_add(design_row, rank, transform, stride, 5, ccl_get_feature(q_buffer, 3) - ccl_get_feature(p_buffer, 3));
+ design_row_add(design_row, rank, transform, stride, 6, ccl_get_feature(q_buffer, 4) - ccl_get_feature(p_buffer, 4));
+ design_row_add(design_row, rank, transform, stride, 7, ccl_get_feature(q_buffer, 5) - ccl_get_feature(p_buffer, 5));
+ design_row_add(design_row, rank, transform, stride, 8, ccl_get_feature(q_buffer, 6) - ccl_get_feature(p_buffer, 6));
+ design_row_add(design_row, rank, transform, stride, 9, ccl_get_feature(q_buffer, 7) - ccl_get_feature(p_buffer, 7));
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/filter/filter_features_sse.h b/intern/cycles/kernel/filter/filter_features_sse.h
new file mode 100644
index 00000000000..3185330994c
--- /dev/null
+++ b/intern/cycles/kernel/filter/filter_features_sse.h
@@ -0,0 +1,105 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+#define ccl_get_feature_sse(pass) _mm_loadu_ps(buffer + (pass)*pass_stride)
+
+/* Loop over the pixels in the range [low.x, high.x) x [low.y, high.y), 4 at a time.
+ * pixel_buffer always points to the first of the 4 current pixel in the first pass.
+ * x4 and y4 contain the coordinates of the four pixels, active_pixels contains a mask that's set for all pixels within the window. */
+
+#define FOR_PIXEL_WINDOW_SSE pixel_buffer = buffer + (low.y - rect.y)*buffer_w + (low.x - rect.x); \
+ for(pixel.y = low.y; pixel.y < high.y; pixel.y++) { \
+ __m128 y4 = _mm_set1_ps(pixel.y); \
+ for(pixel.x = low.x; pixel.x < high.x; pixel.x += 4, pixel_buffer += 4) { \
+ __m128 x4 = _mm_add_ps(_mm_set1_ps(pixel.x), _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); \
+ __m128 active_pixels = _mm_cmplt_ps(x4, _mm_set1_ps(high.x));
+
+#define END_FOR_PIXEL_WINDOW_SSE } \
+ pixel_buffer += buffer_w - (pixel.x - low.x); \
+ }
+
+ccl_device_inline void filter_get_features_sse(__m128 x, __m128 y,
+ __m128 active_pixels,
+ const float *ccl_restrict buffer,
+ __m128 *features,
+ const __m128 *ccl_restrict mean,
+ int pass_stride)
+{
+ features[0] = x;
+ features[1] = y;
+ features[2] = _mm_fabs_ps(ccl_get_feature_sse(0));
+ features[3] = ccl_get_feature_sse(1);
+ features[4] = ccl_get_feature_sse(2);
+ features[5] = ccl_get_feature_sse(3);
+ features[6] = ccl_get_feature_sse(4);
+ features[7] = ccl_get_feature_sse(5);
+ features[8] = ccl_get_feature_sse(6);
+ features[9] = ccl_get_feature_sse(7);
+ if(mean) {
+ for(int i = 0; i < DENOISE_FEATURES; i++)
+ features[i] = _mm_sub_ps(features[i], mean[i]);
+ }
+ for(int i = 0; i < DENOISE_FEATURES; i++)
+ features[i] = _mm_mask_ps(features[i], active_pixels);
+}
+
+ccl_device_inline void filter_get_feature_scales_sse(__m128 x, __m128 y,
+ __m128 active_pixels,
+ const float *ccl_restrict buffer,
+ __m128 *scales,
+ const __m128 *ccl_restrict mean,
+ int pass_stride)
+{
+ scales[0] = _mm_mask_ps(_mm_fabs_ps(_mm_sub_ps(x, mean[0])), active_pixels);
+ scales[1] = _mm_mask_ps(_mm_fabs_ps(_mm_sub_ps(y, mean[1])), active_pixels);
+
+ scales[2] = _mm_mask_ps(_mm_fabs_ps(_mm_sub_ps(_mm_fabs_ps(ccl_get_feature_sse(0)), mean[2])), active_pixels);
+
+ __m128 diff, scale;
+ diff = _mm_sub_ps(ccl_get_feature_sse(1), mean[3]);
+ scale = _mm_mul_ps(diff, diff);
+ diff = _mm_sub_ps(ccl_get_feature_sse(2), mean[4]);
+ scale = _mm_add_ps(scale, _mm_mul_ps(diff, diff));
+ diff = _mm_sub_ps(ccl_get_feature_sse(3), mean[5]);
+ scale = _mm_add_ps(scale, _mm_mul_ps(diff, diff));
+ scales[3] = _mm_mask_ps(scale, active_pixels);
+
+ scales[4] = _mm_mask_ps(_mm_fabs_ps(_mm_sub_ps(ccl_get_feature_sse(4), mean[6])), active_pixels);
+
+ diff = _mm_sub_ps(ccl_get_feature_sse(5), mean[7]);
+ scale = _mm_mul_ps(diff, diff);
+ diff = _mm_sub_ps(ccl_get_feature_sse(6), mean[8]);
+ scale = _mm_add_ps(scale, _mm_mul_ps(diff, diff));
+ diff = _mm_sub_ps(ccl_get_feature_sse(7), mean[9]);
+ scale = _mm_add_ps(scale, _mm_mul_ps(diff, diff));
+ scales[5] = _mm_mask_ps(scale, active_pixels);
+}
+
+ccl_device_inline void filter_calculate_scale_sse(__m128 *scale)
+{
+ scale[0] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(scale[0]), _mm_set1_ps(0.01f)));
+ scale[1] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(scale[1]), _mm_set1_ps(0.01f)));
+ scale[2] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(scale[2]), _mm_set1_ps(0.01f)));
+ scale[6] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(scale[4]), _mm_set1_ps(0.01f)));
+
+ scale[7] = scale[8] = scale[9] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(_mm_sqrt_ps(scale[5])), _mm_set1_ps(0.01f)));
+ scale[3] = scale[4] = scale[5] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(_mm_sqrt_ps(scale[3])), _mm_set1_ps(0.01f)));
+}
+
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/filter/filter_kernel.h b/intern/cycles/kernel/filter/filter_kernel.h
new file mode 100644
index 00000000000..2ef03dc0a02
--- /dev/null
+++ b/intern/cycles/kernel/filter/filter_kernel.h
@@ -0,0 +1,50 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "util/util_color.h"
+#include "util/util_math.h"
+#include "util/util_math_fast.h"
+#include "util/util_texture.h"
+
+#include "util/util_atomic.h"
+#include "util/util_math_matrix.h"
+
+#include "kernel/filter/filter_defines.h"
+
+#include "kernel/filter/filter_features.h"
+#ifdef __KERNEL_SSE3__
+# include "kernel/filter/filter_features_sse.h"
+#endif
+
+#include "kernel/filter/filter_prefilter.h"
+
+#ifdef __KERNEL_GPU__
+# include "kernel/filter/filter_transform_gpu.h"
+#else
+# ifdef __KERNEL_SSE3__
+# include "kernel/filter/filter_transform_sse.h"
+# else
+# include "kernel/filter/filter_transform.h"
+# endif
+#endif
+
+#include "kernel/filter/filter_reconstruction.h"
+
+#ifdef __KERNEL_CPU__
+# include "kernel/filter/filter_nlm_cpu.h"
+#else
+# include "kernel/filter/filter_nlm_gpu.h"
+#endif
diff --git a/intern/cycles/kernel/filter/filter_nlm_cpu.h b/intern/cycles/kernel/filter/filter_nlm_cpu.h
new file mode 100644
index 00000000000..3e752bce68f
--- /dev/null
+++ b/intern/cycles/kernel/filter/filter_nlm_cpu.h
@@ -0,0 +1,186 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+ccl_device_inline void kernel_filter_nlm_calc_difference(int dx, int dy,
+ const float *ccl_restrict weight_image,
+ const float *ccl_restrict variance_image,
+ float *difference_image,
+ int4 rect,
+ int w,
+ int channel_offset,
+ float a,
+ float k_2)
+{
+ for(int y = rect.y; y < rect.w; y++) {
+ for(int x = rect.x; x < rect.z; x++) {
+ float diff = 0.0f;
+ int numChannels = channel_offset? 3 : 1;
+ for(int c = 0; c < numChannels; c++) {
+ float cdiff = weight_image[c*channel_offset + y*w+x] - weight_image[c*channel_offset + (y+dy)*w+(x+dx)];
+ float pvar = variance_image[c*channel_offset + y*w+x];
+ float qvar = variance_image[c*channel_offset + (y+dy)*w+(x+dx)];
+ diff += (cdiff*cdiff - a*(pvar + min(pvar, qvar))) / (1e-8f + k_2*(pvar+qvar));
+ }
+ if(numChannels > 1) {
+ diff *= 1.0f/numChannels;
+ }
+ difference_image[y*w+x] = diff;
+ }
+ }
+}
+
+ccl_device_inline void kernel_filter_nlm_blur(const float *ccl_restrict difference_image,
+ float *out_image,
+ int4 rect,
+ int w,
+ int f)
+{
+#ifdef __KERNEL_SSE3__
+ int aligned_lowx = (rect.x & ~(3));
+ int aligned_highx = ((rect.z + 3) & ~(3));
+#endif
+ for(int y = rect.y; y < rect.w; y++) {
+ const int low = max(rect.y, y-f);
+ const int high = min(rect.w, y+f+1);
+ for(int x = rect.x; x < rect.z; x++) {
+ out_image[y*w+x] = 0.0f;
+ }
+ for(int y1 = low; y1 < high; y1++) {
+#ifdef __KERNEL_SSE3__
+ for(int x = aligned_lowx; x < aligned_highx; x+=4) {
+ _mm_store_ps(out_image + y*w+x, _mm_add_ps(_mm_load_ps(out_image + y*w+x), _mm_load_ps(difference_image + y1*w+x)));
+ }
+#else
+ for(int x = rect.x; x < rect.z; x++) {
+ out_image[y*w+x] += difference_image[y1*w+x];
+ }
+#endif
+ }
+ for(int x = rect.x; x < rect.z; x++) {
+ out_image[y*w+x] *= 1.0f/(high - low);
+ }
+ }
+}
+
+ccl_device_inline void kernel_filter_nlm_calc_weight(const float *ccl_restrict difference_image,
+ float *out_image,
+ int4 rect,
+ int w,
+ int f)
+{
+ for(int y = rect.y; y < rect.w; y++) {
+ for(int x = rect.x; x < rect.z; x++) {
+ out_image[y*w+x] = 0.0f;
+ }
+ }
+ for(int dx = -f; dx <= f; dx++) {
+ int pos_dx = max(0, dx);
+ int neg_dx = min(0, dx);
+ for(int y = rect.y; y < rect.w; y++) {
+ for(int x = rect.x-neg_dx; x < rect.z-pos_dx; x++) {
+ out_image[y*w+x] += difference_image[y*w+dx+x];
+ }
+ }
+ }
+ for(int y = rect.y; y < rect.w; y++) {
+ for(int x = rect.x; x < rect.z; x++) {
+ const int low = max(rect.x, x-f);
+ const int high = min(rect.z, x+f+1);
+ out_image[y*w+x] = fast_expf(-max(out_image[y*w+x] * (1.0f/(high - low)), 0.0f));
+ }
+ }
+}
+
+ccl_device_inline void kernel_filter_nlm_update_output(int dx, int dy,
+ const float *ccl_restrict difference_image,
+ const float *ccl_restrict image,
+ float *out_image,
+ float *accum_image,
+ int4 rect,
+ int w,
+ int f)
+{
+ for(int y = rect.y; y < rect.w; y++) {
+ for(int x = rect.x; x < rect.z; x++) {
+ const int low = max(rect.x, x-f);
+ const int high = min(rect.z, x+f+1);
+ float sum = 0.0f;
+ for(int x1 = low; x1 < high; x1++) {
+ sum += difference_image[y*w+x1];
+ }
+ float weight = sum * (1.0f/(high - low));
+ accum_image[y*w+x] += weight;
+ out_image[y*w+x] += weight*image[(y+dy)*w+(x+dx)];
+ }
+ }
+}
+
+ccl_device_inline void kernel_filter_nlm_construct_gramian(int dx, int dy,
+ const float *ccl_restrict difference_image,
+ const float *ccl_restrict buffer,
+ float *transform,
+ int *rank,
+ float *XtWX,
+ float3 *XtWY,
+ int4 rect,
+ int4 filter_rect,
+ int w, int h, int f,
+ int pass_stride)
+{
+ /* fy and fy are in filter-window-relative coordinates, while x and y are in feature-window-relative coordinates. */
+ for(int fy = max(0, rect.y-filter_rect.y); fy < min(filter_rect.w, rect.w-filter_rect.y); fy++) {
+ int y = fy + filter_rect.y;
+ for(int fx = max(0, rect.x-filter_rect.x); fx < min(filter_rect.z, rect.z-filter_rect.x); fx++) {
+ int x = fx + filter_rect.x;
+ const int low = max(rect.x, x-f);
+ const int high = min(rect.z, x+f+1);
+ float sum = 0.0f;
+ for(int x1 = low; x1 < high; x1++) {
+ sum += difference_image[y*w+x1];
+ }
+ float weight = sum * (1.0f/(high - low));
+
+ int storage_ofs = fy*filter_rect.z + fx;
+ float *l_transform = transform + storage_ofs*TRANSFORM_SIZE;
+ float *l_XtWX = XtWX + storage_ofs*XTWX_SIZE;
+ float3 *l_XtWY = XtWY + storage_ofs*XTWY_SIZE;
+ int *l_rank = rank + storage_ofs;
+
+ kernel_filter_construct_gramian(x, y, 1,
+ dx, dy, w, h,
+ pass_stride,
+ buffer,
+ l_transform, l_rank,
+ weight, l_XtWX, l_XtWY, 0);
+ }
+ }
+}
+
+ccl_device_inline void kernel_filter_nlm_normalize(float *out_image,
+ const float *ccl_restrict accum_image,
+ int4 rect,
+ int w)
+{
+ for(int y = rect.y; y < rect.w; y++) {
+ for(int x = rect.x; x < rect.z; x++) {
+ out_image[y*w+x] /= accum_image[y*w+x];
+ }
+ }
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/filter/filter_nlm_gpu.h b/intern/cycles/kernel/filter/filter_nlm_gpu.h
new file mode 100644
index 00000000000..2c5ac807051
--- /dev/null
+++ b/intern/cycles/kernel/filter/filter_nlm_gpu.h
@@ -0,0 +1,144 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+ccl_device_inline void kernel_filter_nlm_calc_difference(int x, int y,
+ int dx, int dy,
+ const ccl_global float *ccl_restrict weight_image,
+ const ccl_global float *ccl_restrict variance_image,
+ ccl_global float *difference_image,
+ int4 rect, int w,
+ int channel_offset,
+ float a, float k_2)
+{
+ float diff = 0.0f;
+ int numChannels = channel_offset? 3 : 1;
+ for(int c = 0; c < numChannels; c++) {
+ float cdiff = weight_image[c*channel_offset + y*w+x] - weight_image[c*channel_offset + (y+dy)*w+(x+dx)];
+ float pvar = variance_image[c*channel_offset + y*w+x];
+ float qvar = variance_image[c*channel_offset + (y+dy)*w+(x+dx)];
+ diff += (cdiff*cdiff - a*(pvar + min(pvar, qvar))) / (1e-8f + k_2*(pvar+qvar));
+ }
+ if(numChannels > 1) {
+ diff *= 1.0f/numChannels;
+ }
+ difference_image[y*w+x] = diff;
+}
+
+ccl_device_inline void kernel_filter_nlm_blur(int x, int y,
+ const ccl_global float *ccl_restrict difference_image,
+ ccl_global float *out_image,
+ int4 rect, int w, int f)
+{
+ float sum = 0.0f;
+ const int low = max(rect.y, y-f);
+ const int high = min(rect.w, y+f+1);
+ for(int y1 = low; y1 < high; y1++) {
+ sum += difference_image[y1*w+x];
+ }
+ sum *= 1.0f/(high-low);
+ out_image[y*w+x] = sum;
+}
+
+ccl_device_inline void kernel_filter_nlm_calc_weight(int x, int y,
+ const ccl_global float *ccl_restrict difference_image,
+ ccl_global float *out_image,
+ int4 rect, int w, int f)
+{
+ float sum = 0.0f;
+ const int low = max(rect.x, x-f);
+ const int high = min(rect.z, x+f+1);
+ for(int x1 = low; x1 < high; x1++) {
+ sum += difference_image[y*w+x1];
+ }
+ sum *= 1.0f/(high-low);
+ out_image[y*w+x] = fast_expf(-max(sum, 0.0f));
+}
+
+ccl_device_inline void kernel_filter_nlm_update_output(int x, int y,
+ int dx, int dy,
+ const ccl_global float *ccl_restrict difference_image,
+ const ccl_global float *ccl_restrict image,
+ ccl_global float *out_image,
+ ccl_global float *accum_image,
+ int4 rect, int w, int f)
+{
+ float sum = 0.0f;
+ const int low = max(rect.x, x-f);
+ const int high = min(rect.z, x+f+1);
+ for(int x1 = low; x1 < high; x1++) {
+ sum += difference_image[y*w+x1];
+ }
+ sum *= 1.0f/(high-low);
+ if(out_image) {
+ accum_image[y*w+x] += sum;
+ out_image[y*w+x] += sum*image[(y+dy)*w+(x+dx)];
+ }
+ else {
+ accum_image[y*w+x] = sum;
+ }
+}
+
+ccl_device_inline void kernel_filter_nlm_construct_gramian(int fx, int fy,
+ int dx, int dy,
+ const ccl_global float *ccl_restrict difference_image,
+ const ccl_global float *ccl_restrict buffer,
+ const ccl_global float *ccl_restrict transform,
+ ccl_global int *rank,
+ ccl_global float *XtWX,
+ ccl_global float3 *XtWY,
+ int4 rect,
+ int4 filter_rect,
+ int w, int h, int f,
+ int pass_stride,
+ int localIdx)
+{
+ int y = fy + filter_rect.y;
+ int x = fx + filter_rect.x;
+ const int low = max(rect.x, x-f);
+ const int high = min(rect.z, x+f+1);
+ float sum = 0.0f;
+ for(int x1 = low; x1 < high; x1++) {
+ sum += difference_image[y*w+x1];
+ }
+ float weight = sum * (1.0f/(high - low));
+
+ int storage_ofs = fy*filter_rect.z + fx;
+ transform += storage_ofs;
+ rank += storage_ofs;
+ XtWX += storage_ofs;
+ XtWY += storage_ofs;
+
+ kernel_filter_construct_gramian(x, y,
+ filter_rect.z*filter_rect.w,
+ dx, dy, w, h,
+ pass_stride,
+ buffer,
+ transform, rank,
+ weight, XtWX, XtWY,
+ localIdx);
+}
+
+ccl_device_inline void kernel_filter_nlm_normalize(int x, int y,
+ ccl_global float *out_image,
+ const ccl_global float *ccl_restrict accum_image,
+ int4 rect, int w)
+{
+ out_image[y*w+x] /= accum_image[y*w+x];
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/filter/filter_prefilter.h b/intern/cycles/kernel/filter/filter_prefilter.h
new file mode 100644
index 00000000000..a0b89c1111f
--- /dev/null
+++ b/intern/cycles/kernel/filter/filter_prefilter.h
@@ -0,0 +1,211 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+/* First step of the shadow prefiltering, performs the shadow division and stores all data
+ * in a nice and easy rectangular array that can be passed to the NLM filter.
+ *
+ * Calculates:
+ * unfiltered: Contains the two half images of the shadow feature pass
+ * sampleVariance: The sample-based variance calculated in the kernel. Note: This calculation is biased in general, and especially here since the variance of the ratio can only be approximated.
+ * sampleVarianceV: Variance of the sample variance estimation, quite noisy (since it's essentially the buffer variance of the two variance halves)
+ * bufferVariance: The buffer-based variance of the shadow feature. Unbiased, but quite noisy.
+ */
+ccl_device void kernel_filter_divide_shadow(int sample,
+ ccl_global TilesInfo *tiles,
+ int x, int y,
+ ccl_global float *unfilteredA,
+ ccl_global float *unfilteredB,
+ ccl_global float *sampleVariance,
+ ccl_global float *sampleVarianceV,
+ ccl_global float *bufferVariance,
+ int4 rect,
+ int buffer_pass_stride,
+ int buffer_denoising_offset,
+ bool use_split_variance)
+{
+ int xtile = (x < tiles->x[1])? 0: ((x < tiles->x[2])? 1: 2);
+ int ytile = (y < tiles->y[1])? 0: ((y < tiles->y[2])? 1: 2);
+ int tile = ytile*3+xtile;
+
+ int offset = tiles->offsets[tile];
+ int stride = tiles->strides[tile];
+ const ccl_global float *ccl_restrict center_buffer = (ccl_global float*) tiles->buffers[tile];
+ center_buffer += (y*stride + x + offset)*buffer_pass_stride;
+ center_buffer += buffer_denoising_offset + 14;
+
+ int buffer_w = align_up(rect.z - rect.x, 4);
+ int idx = (y-rect.y)*buffer_w + (x - rect.x);
+ unfilteredA[idx] = center_buffer[1] / max(center_buffer[0], 1e-7f);
+ unfilteredB[idx] = center_buffer[4] / max(center_buffer[3], 1e-7f);
+
+ float varA = center_buffer[2];
+ float varB = center_buffer[5];
+ int odd_sample = (sample+1)/2;
+ int even_sample = sample/2;
+ if(use_split_variance) {
+ varA = max(0.0f, varA - unfilteredA[idx]*unfilteredA[idx]*odd_sample);
+ varB = max(0.0f, varB - unfilteredB[idx]*unfilteredB[idx]*even_sample);
+ }
+ varA /= max(odd_sample - 1, 1);
+ varB /= max(even_sample - 1, 1);
+
+ sampleVariance[idx] = 0.5f*(varA + varB) / sample;
+ sampleVarianceV[idx] = 0.5f * (varA - varB) * (varA - varB) / (sample*sample);
+ bufferVariance[idx] = 0.5f * (unfilteredA[idx] - unfilteredB[idx]) * (unfilteredA[idx] - unfilteredB[idx]);
+}
+
+/* Load a regular feature from the render buffers into the denoise buffer.
+ * Parameters:
+ * - sample: The sample amount in the buffer, used to normalize the buffer.
+ * - m_offset, v_offset: Render Buffer Pass offsets of mean and variance of the feature.
+ * - x, y: Current pixel
+ * - mean, variance: Target denoise buffers.
+ * - rect: The prefilter area (lower pixels inclusive, upper pixels exclusive).
+ */
+ccl_device void kernel_filter_get_feature(int sample,
+ ccl_global TilesInfo *tiles,
+ int m_offset, int v_offset,
+ int x, int y,
+ ccl_global float *mean,
+ ccl_global float *variance,
+ int4 rect, int buffer_pass_stride,
+ int buffer_denoising_offset,
+ bool use_split_variance)
+{
+ int xtile = (x < tiles->x[1])? 0: ((x < tiles->x[2])? 1: 2);
+ int ytile = (y < tiles->y[1])? 0: ((y < tiles->y[2])? 1: 2);
+ int tile = ytile*3+xtile;
+ ccl_global float *center_buffer = ((ccl_global float*) tiles->buffers[tile]) + (tiles->offsets[tile] + y*tiles->strides[tile] + x)*buffer_pass_stride + buffer_denoising_offset;
+
+ int buffer_w = align_up(rect.z - rect.x, 4);
+ int idx = (y-rect.y)*buffer_w + (x - rect.x);
+
+ mean[idx] = center_buffer[m_offset] / sample;
+ if (sample > 1) {
+ if(use_split_variance) {
+ variance[idx] = max(0.0f, (center_buffer[v_offset] - mean[idx]*mean[idx]*sample) / (sample * (sample-1)));
+ }
+ else {
+ variance[idx] = center_buffer[v_offset] / (sample * (sample-1));
+ }
+ }
+ else {
+ /* Can't compute variance with single sample, just set it very high. */
+ variance[idx] = 1e10f;
+ }
+}
+
+ccl_device void kernel_filter_detect_outliers(int x, int y,
+ ccl_global float *image,
+ ccl_global float *variance,
+ ccl_global float *depth,
+ ccl_global float *out,
+ int4 rect,
+ int pass_stride)
+{
+ int buffer_w = align_up(rect.z - rect.x, 4);
+
+ int n = 0;
+ float values[25];
+ for(int y1 = max(y-2, rect.y); y1 < min(y+3, rect.w); y1++) {
+ for(int x1 = max(x-2, rect.x); x1 < min(x+3, rect.z); x1++) {
+ int idx = (y1-rect.y)*buffer_w + (x1-rect.x);
+ float L = average(make_float3(image[idx], image[idx+pass_stride], image[idx+2*pass_stride]));
+
+ /* Find the position of L. */
+ int i;
+ for(i = 0; i < n; i++) {
+ if(values[i] > L) break;
+ }
+ /* Make space for L by shifting all following values to the right. */
+ for(int j = n; j > i; j--) {
+ values[j] = values[j-1];
+ }
+ /* Insert L. */
+ values[i] = L;
+ n++;
+ }
+ }
+
+ int idx = (y-rect.y)*buffer_w + (x-rect.x);
+ float L = average(make_float3(image[idx], image[idx+pass_stride], image[idx+2*pass_stride]));
+
+ float ref = 2.0f*values[(int)(n*0.75f)];
+ float fac = 1.0f;
+ if(L > ref) {
+ /* The pixel appears to be an outlier.
+ * However, it may just be a legitimate highlight. Therefore, it is checked how likely it is that the pixel
+ * should actually be at the reference value:
+ * If the reference is within the 3-sigma interval, the pixel is assumed to be a statistical outlier.
+ * Otherwise, it is very unlikely that the pixel should be darker, which indicates a legitimate highlight.
+ */
+ float stddev = sqrtf(average(make_float3(variance[idx], variance[idx+pass_stride], variance[idx+2*pass_stride])));
+ if(L - 3*stddev < ref) {
+ /* The pixel is an outlier, so negate the depth value to mark it as one.
+ * Also, scale its brightness down to the outlier threshold to avoid trouble with the NLM weights. */
+ depth[idx] = -depth[idx];
+ fac = ref/L;
+ variance[idx ] *= fac*fac;
+ variance[idx + pass_stride] *= fac*fac;
+ variance[idx+2*pass_stride] *= fac*fac;
+ }
+ }
+ out[idx ] = fac*image[idx];
+ out[idx + pass_stride] = fac*image[idx + pass_stride];
+ out[idx+2*pass_stride] = fac*image[idx+2*pass_stride];
+}
+
+/* Combine A/B buffers.
+ * Calculates the combined mean and the buffer variance. */
+ccl_device void kernel_filter_combine_halves(int x, int y,
+ ccl_global float *mean,
+ ccl_global float *variance,
+ ccl_global float *a,
+ ccl_global float *b,
+ int4 rect, int r)
+{
+ int buffer_w = align_up(rect.z - rect.x, 4);
+ int idx = (y-rect.y)*buffer_w + (x - rect.x);
+
+ if(mean) mean[idx] = 0.5f * (a[idx]+b[idx]);
+ if(variance) {
+ if(r == 0) variance[idx] = 0.25f * (a[idx]-b[idx])*(a[idx]-b[idx]);
+ else {
+ variance[idx] = 0.0f;
+ float values[25];
+ int numValues = 0;
+ for(int py = max(y-r, rect.y); py < min(y+r+1, rect.w); py++) {
+ for(int px = max(x-r, rect.x); px < min(x+r+1, rect.z); px++) {
+ int pidx = (py-rect.y)*buffer_w + (px-rect.x);
+ values[numValues++] = 0.25f * (a[pidx]-b[pidx])*(a[pidx]-b[pidx]);
+ }
+ }
+ /* Insertion-sort the variances (fast enough for 25 elements). */
+ for(int i = 1; i < numValues; i++) {
+ float v = values[i];
+ int j;
+ for(j = i-1; j >= 0 && values[j] > v; j--)
+ values[j+1] = values[j];
+ values[j+1] = v;
+ }
+ variance[idx] = values[(7*numValues)/8];
+ }
+ }
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/filter/filter_reconstruction.h b/intern/cycles/kernel/filter/filter_reconstruction.h
new file mode 100644
index 00000000000..25a3025056c
--- /dev/null
+++ b/intern/cycles/kernel/filter/filter_reconstruction.h
@@ -0,0 +1,117 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+ccl_device_inline void kernel_filter_construct_gramian(int x, int y,
+ int storage_stride,
+ int dx, int dy,
+ int w, int h,
+ int pass_stride,
+ const ccl_global float *ccl_restrict buffer,
+ const ccl_global float *ccl_restrict transform,
+ ccl_global int *rank,
+ float weight,
+ ccl_global float *XtWX,
+ ccl_global float3 *XtWY,
+ int localIdx)
+{
+ if(weight < 1e-3f) {
+ return;
+ }
+
+ int p_offset = y *w + x;
+ int q_offset = (y+dy)*w + (x+dx);
+
+#ifdef __KERNEL_GPU__
+ const int stride = storage_stride;
+#else
+ const int stride = 1;
+ (void) storage_stride;
+#endif
+
+#ifdef __KERNEL_CUDA__
+ ccl_local float shared_design_row[(DENOISE_FEATURES+1)*CCL_MAX_LOCAL_SIZE];
+ ccl_local_param float *design_row = shared_design_row + localIdx*(DENOISE_FEATURES+1);
+#else
+ float design_row[DENOISE_FEATURES+1];
+#endif
+
+ float3 q_color = filter_get_color(buffer + q_offset, pass_stride);
+
+ /* If the pixel was flagged as an outlier during prefiltering, skip it. */
+ if(ccl_get_feature(buffer + q_offset, 0) < 0.0f) {
+ return;
+ }
+
+ filter_get_design_row_transform(make_int2(x, y), buffer + p_offset,
+ make_int2(x+dx, y+dy), buffer + q_offset,
+ pass_stride, *rank, design_row, transform, stride);
+
+ math_trimatrix_add_gramian_strided(XtWX, (*rank)+1, design_row, weight, stride);
+ math_vec3_add_strided(XtWY, (*rank)+1, design_row, weight * q_color, stride);
+}
+
+ccl_device_inline void kernel_filter_finalize(int x, int y, int w, int h,
+ ccl_global float *buffer,
+ ccl_global int *rank,
+ int storage_stride,
+ ccl_global float *XtWX,
+ ccl_global float3 *XtWY,
+ int4 buffer_params,
+ int sample)
+{
+#ifdef __KERNEL_GPU__
+ const int stride = storage_stride;
+#else
+ const int stride = 1;
+ (void) storage_stride;
+#endif
+
+ if(XtWX[0] < 1e-3f) {
+ /* There is not enough information to determine a denoised result.
+ * As a fallback, keep the original value of the pixel. */
+ return;
+ }
+
+ /* The weighted average of pixel colors (essentially, the NLM-filtered image).
+ * In case the solution of the linear model fails due to numerical issues,
+ * fall back to this value. */
+ float3 mean_color = XtWY[0]/XtWX[0];
+
+ math_trimatrix_vec3_solve(XtWX, XtWY, (*rank)+1, stride);
+
+ float3 final_color = XtWY[0];
+ if(!isfinite3_safe(final_color)) {
+ final_color = mean_color;
+ }
+
+ /* Clamp pixel value to positive values. */
+ final_color = max(final_color, make_float3(0.0f, 0.0f, 0.0f));
+
+ ccl_global float *combined_buffer = buffer + (y*buffer_params.y + x + buffer_params.x)*buffer_params.z;
+ final_color *= sample;
+ if(buffer_params.w) {
+ final_color.x += combined_buffer[buffer_params.w+0];
+ final_color.y += combined_buffer[buffer_params.w+1];
+ final_color.z += combined_buffer[buffer_params.w+2];
+ }
+ combined_buffer[0] = final_color.x;
+ combined_buffer[1] = final_color.y;
+ combined_buffer[2] = final_color.z;
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/filter/filter_transform.h b/intern/cycles/kernel/filter/filter_transform.h
new file mode 100644
index 00000000000..a5f87c05ec0
--- /dev/null
+++ b/intern/cycles/kernel/filter/filter_transform.h
@@ -0,0 +1,108 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+ccl_device void kernel_filter_construct_transform(const float *ccl_restrict buffer,
+ int x, int y, int4 rect,
+ int pass_stride,
+ float *transform, int *rank,
+ int radius, float pca_threshold)
+{
+ int buffer_w = align_up(rect.z - rect.x, 4);
+
+ float features[DENOISE_FEATURES];
+
+ /* Temporary storage, used in different steps of the algorithm. */
+ float tempmatrix[DENOISE_FEATURES*DENOISE_FEATURES];
+ float tempvector[2*DENOISE_FEATURES];
+ const float *ccl_restrict pixel_buffer;
+ int2 pixel;
+
+ /* === Calculate denoising window. === */
+ int2 low = make_int2(max(rect.x, x - radius),
+ max(rect.y, y - radius));
+ int2 high = make_int2(min(rect.z, x + radius + 1),
+ min(rect.w, y + radius + 1));
+ int num_pixels = (high.y - low.y) * (high.x - low.x);
+
+ /* === Shift feature passes to have mean 0. === */
+ float feature_means[DENOISE_FEATURES];
+ math_vector_zero(feature_means, DENOISE_FEATURES);
+ FOR_PIXEL_WINDOW {
+ filter_get_features(pixel, pixel_buffer, features, NULL, pass_stride);
+ math_vector_add(feature_means, features, DENOISE_FEATURES);
+ } END_FOR_PIXEL_WINDOW
+
+ math_vector_scale(feature_means, 1.0f / num_pixels, DENOISE_FEATURES);
+
+ /* === Scale the shifted feature passes to a range of [-1; 1], will be baked into the transform later. === */
+ float *feature_scale = tempvector;
+ math_vector_zero(feature_scale, DENOISE_FEATURES);
+
+ FOR_PIXEL_WINDOW {
+ filter_get_feature_scales(pixel, pixel_buffer, features, feature_means, pass_stride);
+ math_vector_max(feature_scale, features, DENOISE_FEATURES);
+ } END_FOR_PIXEL_WINDOW
+
+ filter_calculate_scale(feature_scale);
+
+ /* === Generate the feature transformation. ===
+ * This transformation maps the DENOISE_FEATURES-dimentional feature space to a reduced feature (r-feature) space
+ * which generally has fewer dimensions. This mainly helps to prevent overfitting. */
+ float* feature_matrix = tempmatrix;
+ math_matrix_zero(feature_matrix, DENOISE_FEATURES);
+ FOR_PIXEL_WINDOW {
+ filter_get_features(pixel, pixel_buffer, features, feature_means, pass_stride);
+ math_vector_mul(features, feature_scale, DENOISE_FEATURES);
+ math_matrix_add_gramian(feature_matrix, DENOISE_FEATURES, features, 1.0f);
+ } END_FOR_PIXEL_WINDOW
+
+ math_matrix_jacobi_eigendecomposition(feature_matrix, transform, DENOISE_FEATURES, 1);
+ *rank = 0;
+ /* Prevent overfitting when a small window is used. */
+ int max_rank = min(DENOISE_FEATURES, num_pixels/3);
+ if(pca_threshold < 0.0f) {
+ float threshold_energy = 0.0f;
+ for(int i = 0; i < DENOISE_FEATURES; i++) {
+ threshold_energy += feature_matrix[i*DENOISE_FEATURES+i];
+ }
+ threshold_energy *= 1.0f - (-pca_threshold);
+
+ float reduced_energy = 0.0f;
+ for(int i = 0; i < max_rank; i++, (*rank)++) {
+ if(i >= 2 && reduced_energy >= threshold_energy)
+ break;
+ float s = feature_matrix[i*DENOISE_FEATURES+i];
+ reduced_energy += s;
+ }
+ }
+ else {
+ for(int i = 0; i < max_rank; i++, (*rank)++) {
+ float s = feature_matrix[i*DENOISE_FEATURES+i];
+ if(i >= 2 && sqrtf(s) < pca_threshold)
+ break;
+ }
+ }
+
+ /* Bake the feature scaling into the transformation matrix. */
+ for(int i = 0; i < (*rank); i++) {
+ math_vector_mul(transform + i*DENOISE_FEATURES, feature_scale, DENOISE_FEATURES);
+ }
+ math_matrix_transpose(transform, DENOISE_FEATURES, 1);
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/filter/filter_transform_gpu.h b/intern/cycles/kernel/filter/filter_transform_gpu.h
new file mode 100644
index 00000000000..83a1222bbdb
--- /dev/null
+++ b/intern/cycles/kernel/filter/filter_transform_gpu.h
@@ -0,0 +1,119 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+ccl_device void kernel_filter_construct_transform(const ccl_global float *ccl_restrict buffer,
+ int x, int y, int4 rect,
+ int pass_stride,
+ ccl_global float *transform,
+ ccl_global int *rank,
+ int radius, float pca_threshold,
+ int transform_stride, int localIdx)
+{
+ int buffer_w = align_up(rect.z - rect.x, 4);
+
+#ifdef __KERNEL_CUDA__
+ ccl_local float shared_features[DENOISE_FEATURES*CCL_MAX_LOCAL_SIZE];
+ ccl_local_param float *features = shared_features + localIdx*DENOISE_FEATURES;
+#else
+ float features[DENOISE_FEATURES];
+#endif
+
+ /* === Calculate denoising window. === */
+ int2 low = make_int2(max(rect.x, x - radius),
+ max(rect.y, y - radius));
+ int2 high = make_int2(min(rect.z, x + radius + 1),
+ min(rect.w, y + radius + 1));
+ int num_pixels = (high.y - low.y) * (high.x - low.x);
+ const ccl_global float *ccl_restrict pixel_buffer;
+ int2 pixel;
+
+
+
+
+ /* === Shift feature passes to have mean 0. === */
+ float feature_means[DENOISE_FEATURES];
+ math_vector_zero(feature_means, DENOISE_FEATURES);
+ FOR_PIXEL_WINDOW {
+ filter_get_features(pixel, pixel_buffer, features, NULL, pass_stride);
+ math_vector_add(feature_means, features, DENOISE_FEATURES);
+ } END_FOR_PIXEL_WINDOW
+
+ math_vector_scale(feature_means, 1.0f / num_pixels, DENOISE_FEATURES);
+
+ /* === Scale the shifted feature passes to a range of [-1; 1], will be baked into the transform later. === */
+ float feature_scale[DENOISE_FEATURES];
+ math_vector_zero(feature_scale, DENOISE_FEATURES);
+
+ FOR_PIXEL_WINDOW {
+ filter_get_feature_scales(pixel, pixel_buffer, features, feature_means, pass_stride);
+ math_vector_max(feature_scale, features, DENOISE_FEATURES);
+ } END_FOR_PIXEL_WINDOW
+
+ filter_calculate_scale(feature_scale);
+
+
+
+ /* === Generate the feature transformation. ===
+ * This transformation maps the DENOISE_FEATURES-dimentional feature space to a reduced feature (r-feature) space
+ * which generally has fewer dimensions. This mainly helps to prevent overfitting. */
+ float feature_matrix[DENOISE_FEATURES*DENOISE_FEATURES];
+ math_matrix_zero(feature_matrix, DENOISE_FEATURES);
+ FOR_PIXEL_WINDOW {
+ filter_get_features(pixel, pixel_buffer, features, feature_means, pass_stride);
+ math_vector_mul(features, feature_scale, DENOISE_FEATURES);
+ math_matrix_add_gramian(feature_matrix, DENOISE_FEATURES, features, 1.0f);
+ } END_FOR_PIXEL_WINDOW
+
+ math_matrix_jacobi_eigendecomposition(feature_matrix, transform, DENOISE_FEATURES, transform_stride);
+ *rank = 0;
+ /* Prevent overfitting when a small window is used. */
+ int max_rank = min(DENOISE_FEATURES, num_pixels/3);
+ if(pca_threshold < 0.0f) {
+ float threshold_energy = 0.0f;
+ for(int i = 0; i < DENOISE_FEATURES; i++) {
+ threshold_energy += feature_matrix[i*DENOISE_FEATURES+i];
+ }
+ threshold_energy *= 1.0f - (-pca_threshold);
+
+ float reduced_energy = 0.0f;
+ for(int i = 0; i < max_rank; i++, (*rank)++) {
+ if(i >= 2 && reduced_energy >= threshold_energy)
+ break;
+ float s = feature_matrix[i*DENOISE_FEATURES+i];
+ reduced_energy += s;
+ }
+ }
+ else {
+ for(int i = 0; i < max_rank; i++, (*rank)++) {
+ float s = feature_matrix[i*DENOISE_FEATURES+i];
+ if(i >= 2 && sqrtf(s) < pca_threshold)
+ break;
+ }
+ }
+
+ math_matrix_transpose(transform, DENOISE_FEATURES, transform_stride);
+
+ /* Bake the feature scaling into the transformation matrix. */
+ for(int i = 0; i < DENOISE_FEATURES; i++) {
+ for(int j = 0; j < (*rank); j++) {
+ transform[(i*DENOISE_FEATURES + j)*transform_stride] *= feature_scale[i];
+ }
+ }
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/filter/filter_transform_sse.h b/intern/cycles/kernel/filter/filter_transform_sse.h
new file mode 100644
index 00000000000..30dc2969b11
--- /dev/null
+++ b/intern/cycles/kernel/filter/filter_transform_sse.h
@@ -0,0 +1,105 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+ccl_device void kernel_filter_construct_transform(const float *ccl_restrict buffer,
+ int x, int y, int4 rect,
+ int pass_stride,
+ float *transform, int *rank,
+ int radius, float pca_threshold)
+{
+ int buffer_w = align_up(rect.z - rect.x, 4);
+
+ __m128 features[DENOISE_FEATURES];
+ const float *ccl_restrict pixel_buffer;
+ int2 pixel;
+
+ int2 low = make_int2(max(rect.x, x - radius),
+ max(rect.y, y - radius));
+ int2 high = make_int2(min(rect.z, x + radius + 1),
+ min(rect.w, y + radius + 1));
+ int num_pixels = (high.y - low.y) * (high.x - low.x);
+
+ __m128 feature_means[DENOISE_FEATURES];
+ math_vector_zero_sse(feature_means, DENOISE_FEATURES);
+ FOR_PIXEL_WINDOW_SSE {
+ filter_get_features_sse(x4, y4, active_pixels, pixel_buffer, features, NULL, pass_stride);
+ math_vector_add_sse(feature_means, DENOISE_FEATURES, features);
+ } END_FOR_PIXEL_WINDOW_SSE
+
+ __m128 pixel_scale = _mm_set1_ps(1.0f / num_pixels);
+ for(int i = 0; i < DENOISE_FEATURES; i++) {
+ feature_means[i] = _mm_mul_ps(_mm_hsum_ps(feature_means[i]), pixel_scale);
+ }
+
+ __m128 feature_scale[DENOISE_FEATURES];
+ math_vector_zero_sse(feature_scale, DENOISE_FEATURES);
+ FOR_PIXEL_WINDOW_SSE {
+ filter_get_feature_scales_sse(x4, y4, active_pixels, pixel_buffer, features, feature_means, pass_stride);
+ math_vector_max_sse(feature_scale, features, DENOISE_FEATURES);
+ } END_FOR_PIXEL_WINDOW_SSE
+
+ filter_calculate_scale_sse(feature_scale);
+
+ __m128 feature_matrix_sse[DENOISE_FEATURES*DENOISE_FEATURES];
+ math_matrix_zero_sse(feature_matrix_sse, DENOISE_FEATURES);
+ FOR_PIXEL_WINDOW_SSE {
+ filter_get_features_sse(x4, y4, active_pixels, pixel_buffer, features, feature_means, pass_stride);
+ math_vector_mul_sse(features, DENOISE_FEATURES, feature_scale);
+ math_matrix_add_gramian_sse(feature_matrix_sse, DENOISE_FEATURES, features, _mm_set1_ps(1.0f));
+ } END_FOR_PIXEL_WINDOW_SSE
+
+ float feature_matrix[DENOISE_FEATURES*DENOISE_FEATURES];
+ math_matrix_hsum(feature_matrix, DENOISE_FEATURES, feature_matrix_sse);
+
+ math_matrix_jacobi_eigendecomposition(feature_matrix, transform, DENOISE_FEATURES, 1);
+
+ *rank = 0;
+ /* Prevent overfitting when a small window is used. */
+ int max_rank = min(DENOISE_FEATURES, num_pixels/3);
+ if(pca_threshold < 0.0f) {
+ float threshold_energy = 0.0f;
+ for(int i = 0; i < DENOISE_FEATURES; i++) {
+ threshold_energy += feature_matrix[i*DENOISE_FEATURES+i];
+ }
+ threshold_energy *= 1.0f - (-pca_threshold);
+
+ float reduced_energy = 0.0f;
+ for(int i = 0; i < max_rank; i++, (*rank)++) {
+ if(i >= 2 && reduced_energy >= threshold_energy)
+ break;
+ float s = feature_matrix[i*DENOISE_FEATURES+i];
+ reduced_energy += s;
+ }
+ }
+ else {
+ for(int i = 0; i < max_rank; i++, (*rank)++) {
+ float s = feature_matrix[i*DENOISE_FEATURES+i];
+ if(i >= 2 && sqrtf(s) < pca_threshold)
+ break;
+ }
+ }
+
+ math_matrix_transpose(transform, DENOISE_FEATURES, 1);
+
+ /* Bake the feature scaling into the transformation matrix. */
+ for(int i = 0; i < DENOISE_FEATURES; i++) {
+ math_vector_scale(transform + i*DENOISE_FEATURES, _mm_cvtss_f32(feature_scale[i]), *rank);
+ }
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/geom/geom_curve.h b/intern/cycles/kernel/geom/geom_curve.h
index 8888000f0e6..5c3b0ee3c15 100644
--- a/intern/cycles/kernel/geom/geom_curve.h
+++ b/intern/cycles/kernel/geom/geom_curve.h
@@ -565,7 +565,7 @@ ccl_device_curveintersect bool bvh_cardinal_curve_intersect(KernelGlobals *kg, I
r_ext = mw_extension + r_curr;
#ifdef __KERNEL_SSE__
const float3 p_curr_sq = p_curr * p_curr;
- const float3 dxxx = _mm_sqrt_ss(_mm_hadd_ps(p_curr_sq.m128, p_curr_sq.m128));
+ const float3 dxxx(_mm_sqrt_ss(_mm_hadd_ps(p_curr_sq.m128, p_curr_sq.m128)));
float d = dxxx.x;
#else
float d = sqrtf(p_curr.x * p_curr.x + p_curr.y * p_curr.y);
diff --git a/intern/cycles/kernel/geom/geom_triangle.h b/intern/cycles/kernel/geom/geom_triangle.h
index 47778553b94..105aee8da15 100644
--- a/intern/cycles/kernel/geom/geom_triangle.h
+++ b/intern/cycles/kernel/geom/geom_triangle.h
@@ -76,7 +76,7 @@ ccl_device_inline void triangle_vertices(KernelGlobals *kg, int prim, float3 P[3
/* Interpolate smooth vertex normal from vertices */
-ccl_device_inline float3 triangle_smooth_normal(KernelGlobals *kg, int prim, float u, float v)
+ccl_device_inline float3 triangle_smooth_normal(KernelGlobals *kg, float3 Ng, int prim, float u, float v)
{
/* load triangle vertices */
const uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, prim);
@@ -84,7 +84,9 @@ ccl_device_inline float3 triangle_smooth_normal(KernelGlobals *kg, int prim, flo
float3 n1 = float4_to_float3(kernel_tex_fetch(__tri_vnormal, tri_vindex.y));
float3 n2 = float4_to_float3(kernel_tex_fetch(__tri_vnormal, tri_vindex.z));
- return normalize((1.0f - u - v)*n2 + u*n0 + v*n1);
+ float3 N = safe_normalize((1.0f - u - v)*n2 + u*n0 + v*n1);
+
+ return is_zero(N)? Ng: N;
}
/* Ray differentials on triangle */
diff --git a/intern/cycles/kernel/kernel.h b/intern/cycles/kernel/kernel.h
index 06c0fb2fbca..84a988f1dbc 100644
--- a/intern/cycles/kernel/kernel.h
+++ b/intern/cycles/kernel/kernel.h
@@ -50,30 +50,20 @@ void kernel_tex_copy(KernelGlobals *kg,
#define KERNEL_ARCH cpu
#include "kernel/kernels/cpu/kernel_cpu.h"
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
-# define KERNEL_ARCH cpu_sse2
-# include "kernel/kernels/cpu/kernel_cpu.h"
-#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 */
-
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
-# define KERNEL_ARCH cpu_sse3
-# include "kernel/kernels/cpu/kernel_cpu.h"
-#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 */
-
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
-# define KERNEL_ARCH cpu_sse41
-# include "kernel/kernels/cpu/kernel_cpu.h"
-#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 */
-
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
-# define KERNEL_ARCH cpu_avx
-# include "kernel/kernels/cpu/kernel_cpu.h"
-#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX */
-
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
-# define KERNEL_ARCH cpu_avx2
-# include "kernel/kernels/cpu/kernel_cpu.h"
-#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 */
+#define KERNEL_ARCH cpu_sse2
+#include "kernel/kernels/cpu/kernel_cpu.h"
+
+#define KERNEL_ARCH cpu_sse3
+#include "kernel/kernels/cpu/kernel_cpu.h"
+
+#define KERNEL_ARCH cpu_sse41
+#include "kernel/kernels/cpu/kernel_cpu.h"
+
+#define KERNEL_ARCH cpu_avx
+#include "kernel/kernels/cpu/kernel_cpu.h"
+
+#define KERNEL_ARCH cpu_avx2
+#include "kernel/kernels/cpu/kernel_cpu.h"
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernel_accumulate.h b/intern/cycles/kernel/kernel_accumulate.h
index 823d30dde78..9ed16aceb55 100644
--- a/intern/cycles/kernel/kernel_accumulate.h
+++ b/intern/cycles/kernel/kernel_accumulate.h
@@ -220,8 +220,16 @@ ccl_device_inline void path_radiance_init(PathRadiance *L, int use_light_pass)
#ifdef __SHADOW_TRICKS__
L->path_total = make_float3(0.0f, 0.0f, 0.0f);
L->path_total_shaded = make_float3(0.0f, 0.0f, 0.0f);
- L->shadow_color = make_float3(0.0f, 0.0f, 0.0f);
+ L->shadow_background_color = make_float3(0.0f, 0.0f, 0.0f);
+ L->shadow_radiance_sum = make_float3(0.0f, 0.0f, 0.0f);
+ L->shadow_throughput = 0.0f;
#endif
+
+#ifdef __DENOISING_FEATURES__
+ L->denoising_normal = make_float3(0.0f, 0.0f, 0.0f);
+ L->denoising_albedo = make_float3(0.0f, 0.0f, 0.0f);
+ L->denoising_depth = 0.0f;
+#endif /* __DENOISING_FEATURES__ */
}
ccl_device_inline void path_radiance_bsdf_bounce(PathRadiance *L, ccl_addr_space float3 *throughput,
@@ -277,15 +285,15 @@ ccl_device_inline void path_radiance_accum_emission(PathRadiance *L, float3 thro
}
ccl_device_inline void path_radiance_accum_ao(PathRadiance *L,
+ ccl_addr_space PathState *state,
float3 throughput,
float3 alpha,
float3 bsdf,
- float3 ao,
- int bounce)
+ float3 ao)
{
#ifdef __PASSES__
if(L->use_light_pass) {
- if(bounce == 0) {
+ if(state->bounce == 0) {
/* directly visible lighting */
L->direct_diffuse += throughput*bsdf*ao;
L->ao += alpha*throughput*ao;
@@ -302,31 +310,43 @@ ccl_device_inline void path_radiance_accum_ao(PathRadiance *L,
}
#ifdef __SHADOW_TRICKS__
- float3 light = throughput * bsdf;
- L->path_total += light;
- L->path_total_shaded += ao * light;
+ if(state->flag & PATH_RAY_STORE_SHADOW_INFO) {
+ float3 light = throughput * bsdf;
+ L->path_total += light;
+ L->path_total_shaded += ao * light;
+ }
#endif
}
ccl_device_inline void path_radiance_accum_total_ao(
PathRadiance *L,
+ ccl_addr_space PathState *state,
float3 throughput,
float3 bsdf)
{
#ifdef __SHADOW_TRICKS__
- L->path_total += throughput * bsdf;
+ if(state->flag & PATH_RAY_STORE_SHADOW_INFO) {
+ L->path_total += throughput * bsdf;
+ }
#else
(void) L;
+ (void) state;
(void) throughput;
(void) bsdf;
#endif
}
-ccl_device_inline void path_radiance_accum_light(PathRadiance *L, float3 throughput, BsdfEval *bsdf_eval, float3 shadow, float shadow_fac, int bounce, bool is_lamp)
+ccl_device_inline void path_radiance_accum_light(PathRadiance *L,
+ ccl_addr_space PathState *state,
+ float3 throughput,
+ BsdfEval *bsdf_eval,
+ float3 shadow,
+ float shadow_fac,
+ bool is_lamp)
{
#ifdef __PASSES__
if(L->use_light_pass) {
- if(bounce == 0) {
+ if(state->bounce == 0) {
/* directly visible lighting */
L->direct_diffuse += throughput*bsdf_eval->diffuse*shadow;
L->direct_glossy += throughput*bsdf_eval->glossy*shadow;
@@ -352,21 +372,27 @@ ccl_device_inline void path_radiance_accum_light(PathRadiance *L, float3 through
}
#ifdef __SHADOW_TRICKS__
- float3 light = throughput * bsdf_eval->sum_no_mis;
- L->path_total += light;
- L->path_total_shaded += shadow * light;
+ if(state->flag & PATH_RAY_STORE_SHADOW_INFO) {
+ float3 light = throughput * bsdf_eval->sum_no_mis;
+ L->path_total += light;
+ L->path_total_shaded += shadow * light;
+ }
#endif
}
ccl_device_inline void path_radiance_accum_total_light(
PathRadiance *L,
+ ccl_addr_space PathState *state,
float3 throughput,
const BsdfEval *bsdf_eval)
{
#ifdef __SHADOW_TRICKS__
- L->path_total += throughput * bsdf_eval->sum_no_mis;
+ if(state->flag & PATH_RAY_STORE_SHADOW_INFO) {
+ L->path_total += throughput * bsdf_eval->sum_no_mis;
+ }
#else
(void) L;
+ (void) state;
(void) throughput;
(void) bsdf_eval;
#endif
@@ -393,11 +419,17 @@ ccl_device_inline void path_radiance_accum_background(PathRadiance *L,
}
#ifdef __SHADOW_TRICKS__
- L->path_total += throughput * value;
- if(state->flag & PATH_RAY_SHADOW_CATCHER_ONLY) {
- L->path_total_shaded += throughput * value;
+ if(state->flag & PATH_RAY_STORE_SHADOW_INFO) {
+ L->path_total += throughput * value;
+ if(state->flag & PATH_RAY_SHADOW_CATCHER_ONLY) {
+ L->path_total_shaded += throughput * value;
+ }
}
#endif
+
+#ifdef __DENOISING_FEATURES__
+ L->denoising_albedo += state->denoising_feature_weight * value;
+#endif /* __DENOISING_FEATURES__ */
}
ccl_device_inline void path_radiance_sum_indirect(PathRadiance *L)
@@ -555,29 +587,79 @@ ccl_device_inline float3 path_radiance_clamp_and_sum(KernelGlobals *kg, PathRadi
return L_sum;
}
+ccl_device_inline void path_radiance_split_denoising(KernelGlobals *kg, PathRadiance *L, float3 *noisy, float3 *clean)
+{
+#ifdef __PASSES__
+ kernel_assert(L->use_light_pass);
+
+ *clean = L->emission + L->background;
+ *noisy = L->direct_scatter + L->indirect_scatter;
+
+# define ADD_COMPONENT(flag, component) \
+ if(kernel_data.film.denoising_flags & flag) \
+ *clean += component; \
+ else \
+ *noisy += component;
+
+ ADD_COMPONENT(DENOISING_CLEAN_DIFFUSE_DIR, L->direct_diffuse);
+ ADD_COMPONENT(DENOISING_CLEAN_DIFFUSE_IND, L->indirect_diffuse);
+ ADD_COMPONENT(DENOISING_CLEAN_GLOSSY_DIR, L->direct_glossy);
+ ADD_COMPONENT(DENOISING_CLEAN_GLOSSY_IND, L->indirect_glossy);
+ ADD_COMPONENT(DENOISING_CLEAN_TRANSMISSION_DIR, L->direct_transmission);
+ ADD_COMPONENT(DENOISING_CLEAN_TRANSMISSION_IND, L->indirect_transmission);
+ ADD_COMPONENT(DENOISING_CLEAN_SUBSURFACE_DIR, L->direct_subsurface);
+ ADD_COMPONENT(DENOISING_CLEAN_SUBSURFACE_IND, L->indirect_subsurface);
+# undef ADD_COMPONENT
+#else
+ *noisy = L->emission;
+ *clean = make_float3(0.0f, 0.0f, 0.0f);
+#endif
+
+ *noisy = ensure_finite3(*noisy);
+ *clean = ensure_finite3(*clean);
+}
+
ccl_device_inline void path_radiance_accum_sample(PathRadiance *L, PathRadiance *L_sample, int num_samples)
{
float fac = 1.0f/num_samples;
+#ifdef __SPLIT_KERNEL__
+# define safe_float3_add(f, v) \
+ do { \
+ ccl_global float *p = (ccl_global float*)(&(f)); \
+ atomic_add_and_fetch_float(p+0, (v).x); \
+ atomic_add_and_fetch_float(p+1, (v).y); \
+ atomic_add_and_fetch_float(p+2, (v).z); \
+ } while(0)
+#else
+# define safe_float3_add(f, v) (f) += (v)
+#endif /* __SPLIT_KERNEL__ */
+
#ifdef __PASSES__
- L->direct_diffuse += L_sample->direct_diffuse*fac;
- L->direct_glossy += L_sample->direct_glossy*fac;
- L->direct_transmission += L_sample->direct_transmission*fac;
- L->direct_subsurface += L_sample->direct_subsurface*fac;
- L->direct_scatter += L_sample->direct_scatter*fac;
-
- L->indirect_diffuse += L_sample->indirect_diffuse*fac;
- L->indirect_glossy += L_sample->indirect_glossy*fac;
- L->indirect_transmission += L_sample->indirect_transmission*fac;
- L->indirect_subsurface += L_sample->indirect_subsurface*fac;
- L->indirect_scatter += L_sample->indirect_scatter*fac;
-
- L->background += L_sample->background*fac;
- L->ao += L_sample->ao*fac;
- L->shadow += L_sample->shadow*fac;
+ safe_float3_add(L->direct_diffuse, L_sample->direct_diffuse*fac);
+ safe_float3_add(L->direct_glossy, L_sample->direct_glossy*fac);
+ safe_float3_add(L->direct_transmission, L_sample->direct_transmission*fac);
+ safe_float3_add(L->direct_subsurface, L_sample->direct_subsurface*fac);
+ safe_float3_add(L->direct_scatter, L_sample->direct_scatter*fac);
+
+ safe_float3_add(L->indirect_diffuse, L_sample->indirect_diffuse*fac);
+ safe_float3_add(L->indirect_glossy, L_sample->indirect_glossy*fac);
+ safe_float3_add(L->indirect_transmission, L_sample->indirect_transmission*fac);
+ safe_float3_add(L->indirect_subsurface, L_sample->indirect_subsurface*fac);
+ safe_float3_add(L->indirect_scatter, L_sample->indirect_scatter*fac);
+
+ safe_float3_add(L->background, L_sample->background*fac);
+ safe_float3_add(L->ao, L_sample->ao*fac);
+ safe_float3_add(L->shadow, L_sample->shadow*fac);
+# ifdef __SPLIT_KERNEL__
+ atomic_add_and_fetch_float(&L->mist, L_sample->mist*fac);
+# else
L->mist += L_sample->mist*fac;
-#endif
- L->emission += L_sample->emission * fac;
+# endif /* __SPLIT_KERNEL__ */
+#endif /* __PASSES__ */
+ safe_float3_add(L->emission, L_sample->emission*fac);
+
+#undef safe_float3_add
}
#ifdef __SHADOW_TRICKS__
@@ -595,16 +677,17 @@ ccl_device_inline float path_radiance_sum_shadow(const PathRadiance *L)
/* Calculate final light sum and transparency for shadow catcher object. */
ccl_device_inline float3 path_radiance_sum_shadowcatcher(KernelGlobals *kg,
const PathRadiance *L,
- ccl_addr_space float* L_transparent)
+ float* alpha)
{
const float shadow = path_radiance_sum_shadow(L);
float3 L_sum;
if(kernel_data.background.transparent) {
- *L_transparent = shadow;
- L_sum = make_float3(0.0f, 0.0f, 0.0f);
+ *alpha = 1.0f - L->shadow_throughput * shadow;
+ L_sum = L->shadow_radiance_sum;
}
else {
- L_sum = L->shadow_color * shadow;
+ L_sum = L->shadow_background_color * L->shadow_throughput * shadow +
+ L->shadow_radiance_sum;
}
return L_sum;
}
diff --git a/intern/cycles/kernel/kernel_compat_cpu.h b/intern/cycles/kernel/kernel_compat_cpu.h
index 21da180bb8e..93934ee6b38 100644
--- a/intern/cycles/kernel/kernel_compat_cpu.h
+++ b/intern/cycles/kernel/kernel_compat_cpu.h
@@ -195,7 +195,7 @@ template<typename T> struct texture_image {
if(x < 0.0f || y < 0.0f || x > 1.0f || y > 1.0f) {
return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
}
- /* Fall through. */
+ ATTR_FALLTHROUGH;
case EXTENSION_EXTEND:
ix = wrap_clamp(ix, width);
iy = wrap_clamp(iy, height);
@@ -222,7 +222,7 @@ template<typename T> struct texture_image {
if(x < 0.0f || y < 0.0f || x > 1.0f || y > 1.0f) {
return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
}
- /* Fall through. */
+ ATTR_FALLTHROUGH;
case EXTENSION_EXTEND:
nix = wrap_clamp(ix+1, width);
niy = wrap_clamp(iy+1, height);
@@ -265,7 +265,7 @@ template<typename T> struct texture_image {
if(x < 0.0f || y < 0.0f || x > 1.0f || y > 1.0f) {
return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
}
- /* Fall through. */
+ ATTR_FALLTHROUGH;
case EXTENSION_EXTEND:
pix = wrap_clamp(ix-1, width);
piy = wrap_clamp(iy-1, height);
@@ -335,7 +335,7 @@ template<typename T> struct texture_image {
{
return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
}
- /* Fall through. */
+ ATTR_FALLTHROUGH;
case EXTENSION_EXTEND:
ix = wrap_clamp(ix, width);
iy = wrap_clamp(iy, height);
@@ -374,7 +374,7 @@ template<typename T> struct texture_image {
{
return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
}
- /* Fall through. */
+ ATTR_FALLTHROUGH;
case EXTENSION_EXTEND:
nix = wrap_clamp(ix+1, width);
niy = wrap_clamp(iy+1, height);
@@ -449,7 +449,7 @@ template<typename T> struct texture_image {
{
return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
}
- /* Fall through. */
+ ATTR_FALLTHROUGH;
case EXTENSION_EXTEND:
pix = wrap_clamp(ix-1, width);
piy = wrap_clamp(iy-1, height);
diff --git a/intern/cycles/kernel/kernel_compat_cuda.h b/intern/cycles/kernel/kernel_compat_cuda.h
index c375d17a95f..38708f7ff0b 100644
--- a/intern/cycles/kernel/kernel_compat_cuda.h
+++ b/intern/cycles/kernel/kernel_compat_cuda.h
@@ -55,6 +55,11 @@
#define ccl_restrict __restrict__
#define ccl_align(n) __align__(n)
+#define ATTR_FALLTHROUGH
+
+#define CCL_MAX_LOCAL_SIZE (CUDA_THREADS_BLOCK_WIDTH*CUDA_THREADS_BLOCK_WIDTH)
+
+
/* No assert supported for CUDA */
#define kernel_assert(cond)
diff --git a/intern/cycles/kernel/kernel_compat_opencl.h b/intern/cycles/kernel/kernel_compat_opencl.h
index c2263ac0d49..4836c290312 100644
--- a/intern/cycles/kernel/kernel_compat_opencl.h
+++ b/intern/cycles/kernel/kernel_compat_opencl.h
@@ -50,6 +50,8 @@
# define ccl_addr_space
#endif
+#define ATTR_FALLTHROUGH
+
#define ccl_local_id(d) get_local_id(d)
#define ccl_global_id(d) get_global_id(d)
diff --git a/intern/cycles/kernel/kernel_globals.h b/intern/cycles/kernel/kernel_globals.h
index c9c97ea977e..f95f0d98c52 100644
--- a/intern/cycles/kernel/kernel_globals.h
+++ b/intern/cycles/kernel/kernel_globals.h
@@ -19,6 +19,10 @@
#ifndef __KERNEL_GLOBALS_H__
#define __KERNEL_GLOBALS_H__
+#ifdef __KERNEL_CPU__
+# include "util/util_vector.h"
+#endif
+
CCL_NAMESPACE_BEGIN
/* On the CPU, we pass along the struct KernelGlobals to nearly everywhere in
@@ -38,12 +42,12 @@ struct Intersection;
struct VolumeStep;
typedef struct KernelGlobals {
- texture_image_uchar4 texture_byte4_images[TEX_NUM_BYTE4_CPU];
- texture_image_float4 texture_float4_images[TEX_NUM_FLOAT4_CPU];
- texture_image_half4 texture_half4_images[TEX_NUM_HALF4_CPU];
- texture_image_float texture_float_images[TEX_NUM_FLOAT_CPU];
- texture_image_uchar texture_byte_images[TEX_NUM_BYTE_CPU];
- texture_image_half texture_half_images[TEX_NUM_HALF_CPU];
+ vector<texture_image_float4> texture_float4_images;
+ vector<texture_image_uchar4> texture_byte4_images;
+ vector<texture_image_half4> texture_half4_images;
+ vector<texture_image_float> texture_float_images;
+ vector<texture_image_uchar> texture_byte_images;
+ vector<texture_image_half> texture_half_images;
# define KERNEL_TEX(type, ttype, name) ttype name;
# define KERNEL_IMAGE_TEX(type, ttype, name)
diff --git a/intern/cycles/kernel/kernel_image_opencl.h b/intern/cycles/kernel/kernel_image_opencl.h
index 0352c58037d..90747e09357 100644
--- a/intern/cycles/kernel/kernel_image_opencl.h
+++ b/intern/cycles/kernel/kernel_image_opencl.h
@@ -20,18 +20,19 @@
ccl_device_inline float4 svm_image_texture_read(KernelGlobals *kg, int id, int offset)
{
+ const int texture_type = kernel_tex_type(id);
/* Float4 */
- if(id < TEX_START_BYTE4_OPENCL) {
+ if(texture_type == IMAGE_DATA_TYPE_FLOAT4) {
return kernel_tex_fetch(__tex_image_float4_packed, offset);
}
/* Byte4 */
- else if(id < TEX_START_FLOAT_OPENCL) {
+ else if(texture_type == IMAGE_DATA_TYPE_BYTE4) {
uchar4 r = kernel_tex_fetch(__tex_image_byte4_packed, offset);
float f = 1.0f/255.0f;
return make_float4(r.x*f, r.y*f, r.z*f, r.w*f);
}
/* Float */
- else if(id < TEX_START_BYTE_OPENCL) {
+ else if(texture_type == IMAGE_DATA_TYPE_FLOAT) {
float f = kernel_tex_fetch(__tex_image_float_packed, offset);
return make_float4(f, f, f, 1.0f);
}
@@ -63,23 +64,34 @@ ccl_device_inline float svm_image_texture_frac(float x, int *ix)
return x - (float)i;
}
+ccl_device_inline uint kernel_decode_image_interpolation(uint4 info)
+{
+ return (info.w & (1 << 0)) ? INTERPOLATION_CLOSEST : INTERPOLATION_LINEAR;
+}
+
+ccl_device_inline uint kernel_decode_image_extension(uint4 info)
+{
+ if(info.w & (1 << 1)) {
+ return EXTENSION_REPEAT;
+ }
+ else if(info.w & (1 << 2)) {
+ return EXTENSION_EXTEND;
+ }
+ else {
+ return EXTENSION_CLIP;
+ }
+}
+
ccl_device float4 kernel_tex_image_interp(KernelGlobals *kg, int id, float x, float y)
{
uint4 info = kernel_tex_fetch(__tex_image_packed_info, id*2);
uint width = info.x;
uint height = info.y;
uint offset = info.z;
-
- /* Image Options */
- uint interpolation = (info.w & (1 << 0)) ? INTERPOLATION_CLOSEST : INTERPOLATION_LINEAR;
- uint extension;
- if(info.w & (1 << 1))
- extension = EXTENSION_REPEAT;
- else if(info.w & (1 << 2))
- extension = EXTENSION_EXTEND;
- else
- extension = EXTENSION_CLIP;
-
+ /* Decode image options. */
+ uint interpolation = kernel_decode_image_interpolation(info);
+ uint extension = kernel_decode_image_extension(info);
+ /* Actual sampling. */
float4 r;
int ix, iy, nix, niy;
if(interpolation == INTERPOLATION_CLOSEST) {
@@ -132,7 +144,6 @@ ccl_device float4 kernel_tex_image_interp(KernelGlobals *kg, int id, float x, fl
r += ty*(1.0f - tx)*svm_image_texture_read(kg, id, offset + ix + niy*width);
r += ty*tx*svm_image_texture_read(kg, id, offset + nix + niy*width);
}
-
return r;
}
@@ -144,17 +155,10 @@ ccl_device float4 kernel_tex_image_interp_3d(KernelGlobals *kg, int id, float x,
uint height = info.y;
uint offset = info.z;
uint depth = kernel_tex_fetch(__tex_image_packed_info, id*2+1).x;
-
- /* Image Options */
- uint interpolation = (info.w & (1 << 0)) ? INTERPOLATION_CLOSEST : INTERPOLATION_LINEAR;
- uint extension;
- if(info.w & (1 << 1))
- extension = EXTENSION_REPEAT;
- else if(info.w & (1 << 2))
- extension = EXTENSION_EXTEND;
- else
- extension = EXTENSION_CLIP;
-
+ /* Decode image options. */
+ uint interpolation = kernel_decode_image_interpolation(info);
+ uint extension = kernel_decode_image_extension(info);
+ /* Actual sampling. */
float4 r;
int ix, iy, iz, nix, niy, niz;
if(interpolation == INTERPOLATION_CLOSEST) {
@@ -171,7 +175,7 @@ ccl_device float4 kernel_tex_image_interp_3d(KernelGlobals *kg, int id, float x,
if(extension == EXTENSION_CLIP) {
if(x < 0.0f || y < 0.0f || z < 0.0f ||
x > 1.0f || y > 1.0f || z > 1.0f)
- {
+ {
return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
}
}
@@ -198,12 +202,13 @@ ccl_device float4 kernel_tex_image_interp_3d(KernelGlobals *kg, int id, float x,
niz = svm_image_texture_wrap_periodic(iz+1, depth);
}
else {
- if(extension == EXTENSION_CLIP)
+ if(extension == EXTENSION_CLIP) {
if(x < 0.0f || y < 0.0f || z < 0.0f ||
x > 1.0f || y > 1.0f || z > 1.0f)
{
return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
}
+ }
/* Fall through. */
/* EXTENSION_EXTEND */
nix = svm_image_texture_wrap_clamp(ix+1, width);
@@ -224,8 +229,6 @@ ccl_device float4 kernel_tex_image_interp_3d(KernelGlobals *kg, int id, float x,
r += tz*(1.0f - ty)*tx*svm_image_texture_read(kg, id, offset + nix + iy*width + niz*width*height);
r += tz*ty*(1.0f - tx)*svm_image_texture_read(kg, id, offset + ix + niy*width + niz*width*height);
r += tz*ty*tx*svm_image_texture_read(kg, id, offset + nix + niy*width + niz*width*height);
-
}
-
return r;
}
diff --git a/intern/cycles/kernel/kernel_jitter.h b/intern/cycles/kernel/kernel_jitter.h
index 67546131746..f5855757d3f 100644
--- a/intern/cycles/kernel/kernel_jitter.h
+++ b/intern/cycles/kernel/kernel_jitter.h
@@ -175,15 +175,26 @@ ccl_device float cmj_sample_1D(int s, int N, int p)
return (x + jx)*invN;
}
-ccl_device void cmj_sample_2D(int s, int N, int p, float *fx, float *fy)
+/* TODO(sergey): Do some extra tests and consider moving to util_math.h. */
+ccl_device_inline int cmj_isqrt(int value)
{
- kernel_assert(s < N);
-
#if defined(__KERNEL_CUDA__)
- int m = float_to_int(__fsqrt_ru(N));
+ return float_to_int(__fsqrt_ru(value));
+#elif defined(__KERNEL_GPU__)
+ return float_to_int(sqrtf(value));
#else
- int m = float_to_int(sqrtf(N));
+ /* This is a work around for fast-math on CPU which might replace sqrtf()
+ * with am approximated version.
+ */
+ return float_to_int(sqrtf(value) + 1e-6f);
#endif
+}
+
+ccl_device void cmj_sample_2D(int s, int N, int p, float *fx, float *fy)
+{
+ kernel_assert(s < N);
+
+ int m = cmj_isqrt(N);
int n = (N - 1)/m + 1;
float invN = 1.0f/N;
float invm = 1.0f/m;
diff --git a/intern/cycles/kernel/kernel_light.h b/intern/cycles/kernel/kernel_light.h
index a2909cec1a1..9baa9d54957 100644
--- a/intern/cycles/kernel/kernel_light.h
+++ b/intern/cycles/kernel/kernel_light.h
@@ -102,7 +102,7 @@ ccl_device_inline float area_light_sample(float3 P,
float cu = 1.0f / sqrtf(fu * fu + b0sq) * (fu > 0.0f ? 1.0f : -1.0f);
cu = clamp(cu, -1.0f, 1.0f);
/* Compute xu. */
- float xu = -(cu * z0) / sqrtf(1.0f - cu * cu);
+ float xu = -(cu * z0) / max(sqrtf(1.0f - cu * cu), 1e-7f);
xu = clamp(xu, x0, x1);
/* Compute yv. */
float z0sq = z0 * z0;
diff --git a/intern/cycles/kernel/kernel_passes.h b/intern/cycles/kernel/kernel_passes.h
index ed523696571..9cd7ffb181d 100644
--- a/intern/cycles/kernel/kernel_passes.h
+++ b/intern/cycles/kernel/kernel_passes.h
@@ -60,6 +60,140 @@ ccl_device_inline void kernel_write_pass_float4(ccl_global float *buffer, int sa
#endif /* __SPLIT_KERNEL__ */
}
+#ifdef __DENOISING_FEATURES__
+ccl_device_inline void kernel_write_pass_float_variance(ccl_global float *buffer, int sample, float value)
+{
+ kernel_write_pass_float(buffer, sample, value);
+
+ /* The online one-pass variance update that's used for the megakernel can't easily be implemented
+ * with atomics, so for the split kernel the E[x^2] - 1/N * (E[x])^2 fallback is used. */
+# ifdef __SPLIT_KERNEL__
+ kernel_write_pass_float(buffer+1, sample, value*value);
+# else
+ if(sample == 0) {
+ kernel_write_pass_float(buffer+1, sample, 0.0f);
+ }
+ else {
+ float new_mean = buffer[0] * (1.0f / (sample + 1));
+ float old_mean = (buffer[0] - value) * (1.0f / sample);
+ kernel_write_pass_float(buffer+1, sample, (value - new_mean) * (value - old_mean));
+ }
+# endif
+}
+
+# if defined(__SPLIT_KERNEL__)
+# define kernel_write_pass_float3_unaligned kernel_write_pass_float3
+# else
+ccl_device_inline void kernel_write_pass_float3_unaligned(ccl_global float *buffer, int sample, float3 value)
+{
+ buffer[0] = (sample == 0)? value.x: buffer[0] + value.x;
+ buffer[1] = (sample == 0)? value.y: buffer[1] + value.y;
+ buffer[2] = (sample == 0)? value.z: buffer[2] + value.z;
+}
+# endif
+
+ccl_device_inline void kernel_write_pass_float3_variance(ccl_global float *buffer, int sample, float3 value)
+{
+ kernel_write_pass_float3_unaligned(buffer, sample, value);
+# ifdef __SPLIT_KERNEL__
+ kernel_write_pass_float3_unaligned(buffer+3, sample, value*value);
+# else
+ if(sample == 0) {
+ kernel_write_pass_float3_unaligned(buffer+3, sample, make_float3(0.0f, 0.0f, 0.0f));
+ }
+ else {
+ float3 sum = make_float3(buffer[0], buffer[1], buffer[2]);
+ float3 new_mean = sum * (1.0f / (sample + 1));
+ float3 old_mean = (sum - value) * (1.0f / sample);
+ kernel_write_pass_float3_unaligned(buffer+3, sample, (value - new_mean) * (value - old_mean));
+ }
+# endif
+}
+
+ccl_device_inline void kernel_write_denoising_shadow(KernelGlobals *kg, ccl_global float *buffer,
+ int sample, float path_total, float path_total_shaded)
+{
+ if(kernel_data.film.pass_denoising_data == 0)
+ return;
+
+ buffer += (sample & 1)? DENOISING_PASS_SHADOW_B : DENOISING_PASS_SHADOW_A;
+
+ path_total = ensure_finite(path_total);
+ path_total_shaded = ensure_finite(path_total_shaded);
+
+ kernel_write_pass_float(buffer, sample/2, path_total);
+ kernel_write_pass_float(buffer+1, sample/2, path_total_shaded);
+
+ float value = path_total_shaded / max(path_total, 1e-7f);
+# ifdef __SPLIT_KERNEL__
+ kernel_write_pass_float(buffer+2, sample/2, value*value);
+# else
+ if(sample < 2) {
+ kernel_write_pass_float(buffer+2, sample/2, 0.0f);
+ }
+ else {
+ float old_value = (buffer[1] - path_total_shaded) / max(buffer[0] - path_total, 1e-7f);
+ float new_value = buffer[1] / max(buffer[0], 1e-7f);
+ kernel_write_pass_float(buffer+2, sample, (value - new_value) * (value - old_value));
+ }
+# endif
+}
+#endif /* __DENOISING_FEATURES__ */
+
+ccl_device_inline void kernel_update_denoising_features(KernelGlobals *kg,
+ ShaderData *sd,
+ ccl_addr_space PathState *state,
+ PathRadiance *L)
+{
+#ifdef __DENOISING_FEATURES__
+ if(state->denoising_feature_weight == 0.0f) {
+ return;
+ }
+
+ L->denoising_depth += ensure_finite(state->denoising_feature_weight * sd->ray_length);
+
+ /* Skip implicitly transparent surfaces. */
+ if(sd->flag & SD_HAS_ONLY_VOLUME) {
+ return;
+ }
+
+ float3 normal = make_float3(0.0f, 0.0f, 0.0f);
+ float3 albedo = make_float3(0.0f, 0.0f, 0.0f);
+ float sum_weight = 0.0f, sum_nonspecular_weight = 0.0f;
+
+ for(int i = 0; i < sd->num_closure; i++) {
+ ShaderClosure *sc = &sd->closure[i];
+
+ if(!CLOSURE_IS_BSDF_OR_BSSRDF(sc->type))
+ continue;
+
+ /* All closures contribute to the normal feature, but only diffuse-like ones to the albedo. */
+ normal += sc->N * sc->sample_weight;
+ sum_weight += sc->sample_weight;
+ if(!bsdf_is_specular_like(sc)) {
+ albedo += sc->weight;
+ sum_nonspecular_weight += sc->sample_weight;
+ }
+ }
+
+ /* Wait for next bounce if 75% or more sample weight belongs to specular-like closures. */
+ if((sum_weight == 0.0f) || (sum_nonspecular_weight*4.0f > sum_weight)) {
+ if(sum_weight != 0.0f) {
+ normal /= sum_weight;
+ }
+ L->denoising_normal += ensure_finite3(state->denoising_feature_weight * normal);
+ L->denoising_albedo += ensure_finite3(state->denoising_feature_weight * albedo);
+
+ state->denoising_feature_weight = 0.0f;
+ }
+#else
+ (void) kg;
+ (void) sd;
+ (void) state;
+ (void) L;
+#endif /* __DENOISING_FEATURES__ */
+}
+
ccl_device_inline void kernel_write_data_passes(KernelGlobals *kg, ccl_global float *buffer, PathRadiance *L,
ShaderData *sd, int sample, ccl_addr_space PathState *state, float3 throughput)
{
@@ -199,5 +333,88 @@ ccl_device_inline void kernel_write_light_passes(KernelGlobals *kg, ccl_global f
#endif
}
+ccl_device_inline void kernel_write_result(KernelGlobals *kg, ccl_global float *buffer,
+ int sample, PathRadiance *L, float alpha, bool is_shadow_catcher)
+{
+ if(L) {
+ float3 L_sum;
+#ifdef __SHADOW_TRICKS__
+ if(is_shadow_catcher) {
+ L_sum = path_radiance_sum_shadowcatcher(kg, L, &alpha);
+ }
+ else
+#endif /* __SHADOW_TRICKS__ */
+ {
+ L_sum = path_radiance_clamp_and_sum(kg, L);
+ }
+
+ kernel_write_pass_float4(buffer, sample, make_float4(L_sum.x, L_sum.y, L_sum.z, alpha));
+
+ kernel_write_light_passes(kg, buffer, L, sample);
+
+#ifdef __DENOISING_FEATURES__
+ if(kernel_data.film.pass_denoising_data) {
+# ifdef __SHADOW_TRICKS__
+ kernel_write_denoising_shadow(kg, buffer + kernel_data.film.pass_denoising_data, sample, average(L->path_total), average(L->path_total_shaded));
+# else
+ kernel_write_denoising_shadow(kg, buffer + kernel_data.film.pass_denoising_data, sample, 0.0f, 0.0f);
+# endif
+ if(kernel_data.film.pass_denoising_clean) {
+ float3 noisy, clean;
+#ifdef __SHADOW_TRICKS__
+ if(is_shadow_catcher) {
+ noisy = L_sum;
+ clean = make_float3(0.0f, 0.0f, 0.0f);
+ }
+ else
+#endif /* __SHADOW_TRICKS__ */
+ {
+ path_radiance_split_denoising(kg, L, &noisy, &clean);
+ }
+ kernel_write_pass_float3_variance(buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_COLOR,
+ sample, noisy);
+ kernel_write_pass_float3_unaligned(buffer + kernel_data.film.pass_denoising_clean,
+ sample, clean);
+ }
+ else {
+ kernel_write_pass_float3_variance(buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_COLOR,
+ sample, ensure_finite3(L_sum));
+ }
+
+ kernel_write_pass_float3_variance(buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_NORMAL,
+ sample, L->denoising_normal);
+ kernel_write_pass_float3_variance(buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_ALBEDO,
+ sample, L->denoising_albedo);
+ kernel_write_pass_float_variance(buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_DEPTH,
+ sample, L->denoising_depth);
+ }
+#endif /* __DENOISING_FEATURES__ */
+ }
+ else {
+ kernel_write_pass_float4(buffer, sample, make_float4(0.0f, 0.0f, 0.0f, 0.0f));
+
+#ifdef __DENOISING_FEATURES__
+ if(kernel_data.film.pass_denoising_data) {
+ kernel_write_denoising_shadow(kg, buffer + kernel_data.film.pass_denoising_data, sample, 0.0f, 0.0f);
+
+ kernel_write_pass_float3_variance(buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_COLOR,
+ sample, make_float3(0.0f, 0.0f, 0.0f));
+
+ kernel_write_pass_float3_variance(buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_NORMAL,
+ sample, make_float3(0.0f, 0.0f, 0.0f));
+ kernel_write_pass_float3_variance(buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_ALBEDO,
+ sample, make_float3(0.0f, 0.0f, 0.0f));
+ kernel_write_pass_float_variance(buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_DEPTH,
+ sample, 0.0f);
+
+ if(kernel_data.film.pass_denoising_clean) {
+ kernel_write_pass_float3_unaligned(buffer + kernel_data.film.pass_denoising_clean,
+ sample, make_float3(0.0f, 0.0f, 0.0f));
+ }
+ }
+#endif /* __DENOISING_FEATURES__ */
+ }
+}
+
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernel_path.h b/intern/cycles/kernel/kernel_path.h
index e7957042182..c340b3bc968 100644
--- a/intern/cycles/kernel/kernel_path.h
+++ b/intern/cycles/kernel/kernel_path.h
@@ -58,7 +58,7 @@ ccl_device_noinline void kernel_path_ao(KernelGlobals *kg,
ShaderData *sd,
ShaderData *emission_sd,
PathRadiance *L,
- PathState *state,
+ ccl_addr_space PathState *state,
RNG *rng,
float3 throughput,
float3 ao_alpha)
@@ -90,14 +90,16 @@ ccl_device_noinline void kernel_path_ao(KernelGlobals *kg,
light_ray.dD = differential3_zero();
if(!shadow_blocked(kg, emission_sd, state, &light_ray, &ao_shadow)) {
- path_radiance_accum_ao(L, throughput, ao_alpha, ao_bsdf, ao_shadow, state->bounce);
+ path_radiance_accum_ao(L, state, throughput, ao_alpha, ao_bsdf, ao_shadow);
}
else {
- path_radiance_accum_total_ao(L, throughput, ao_bsdf);
+ path_radiance_accum_total_ao(L, state, throughput, ao_bsdf);
}
}
}
+#ifndef __SPLIT_KERNEL__
+
ccl_device void kernel_path_indirect(KernelGlobals *kg,
ShaderData *sd,
ShaderData *emission_sd,
@@ -364,6 +366,8 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg,
throughput /= probability;
}
+ kernel_update_denoising_features(kg, sd, state, L);
+
#ifdef __AO__
/* ambient occlusion */
if(kernel_data.integrator.use_ambient_occlusion || (sd->flag & SD_AO)) {
@@ -403,7 +407,7 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg,
}
#endif /* __SUBSURFACE__ */
-#if defined(__EMISSION__) && defined(__BRANCHED_PATH__)
+#if defined(__EMISSION__)
if(kernel_data.integrator.use_direct_light) {
int all = (kernel_data.integrator.sample_all_lights_indirect) ||
(state->flag & PATH_RAY_SHADOW_CATCHER);
@@ -417,7 +421,7 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg,
L,
all);
}
-#endif /* defined(__EMISSION__) && defined(__BRANCHED_PATH__) */
+#endif /* defined(__EMISSION__) */
if(!kernel_path_surface_bounce(kg, rng, sd, &throughput, state, L, ray))
break;
@@ -425,18 +429,19 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg,
}
-ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
- RNG *rng,
- int sample,
- Ray ray,
- ccl_global float *buffer)
+ccl_device_inline float kernel_path_integrate(KernelGlobals *kg,
+ RNG *rng,
+ int sample,
+ Ray ray,
+ ccl_global float *buffer,
+ PathRadiance *L,
+ bool *is_shadow_catcher)
{
/* initialize */
- PathRadiance L;
float3 throughput = make_float3(1.0f, 1.0f, 1.0f);
float L_transparent = 0.0f;
- path_radiance_init(&L, kernel_data.film.use_light_pass);
+ path_radiance_init(L, kernel_data.film.use_light_pass);
/* shader data memory used for both volumes and surfaces, saves stack space */
ShaderData sd;
@@ -515,7 +520,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
float3 emission;
if(indirect_lamp_emission(kg, &emission_sd, &state, &light_ray, &emission))
- path_radiance_accum_emission(&L, throughput, emission, state.bounce);
+ path_radiance_accum_emission(L, throughput, emission, state.bounce);
}
#endif /* __LAMP_MIS__ */
@@ -547,7 +552,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
/* emission */
if(volume_segment.closure_flag & SD_EMISSION)
- path_radiance_accum_emission(&L, throughput, volume_segment.accum_emission, state.bounce);
+ path_radiance_accum_emission(L, throughput, volume_segment.accum_emission, state.bounce);
/* scattering */
VolumeIntegrateResult result = VOLUME_PATH_ATTENUATED;
@@ -557,7 +562,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
/* direct light sampling */
kernel_branched_path_volume_connect_light(kg, rng, &sd,
- &emission_sd, throughput, &state, &L, all,
+ &emission_sd, throughput, &state, L, all,
&volume_ray, &volume_segment);
/* indirect sample. if we use distance sampling and take just
@@ -575,7 +580,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
kernel_volume_decoupled_free(kg, &volume_segment);
if(result == VOLUME_PATH_SCATTERED) {
- if(kernel_path_volume_bounce(kg, rng, &sd, &throughput, &state, &L, &ray))
+ if(kernel_path_volume_bounce(kg, rng, &sd, &throughput, &state, L, &ray))
continue;
else
break;
@@ -589,15 +594,15 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
{
/* integrate along volume segment with distance sampling */
VolumeIntegrateResult result = kernel_volume_integrate(
- kg, &state, &sd, &volume_ray, &L, &throughput, rng, heterogeneous);
+ kg, &state, &sd, &volume_ray, L, &throughput, rng, heterogeneous);
# ifdef __VOLUME_SCATTER__
if(result == VOLUME_PATH_SCATTERED) {
/* direct lighting */
- kernel_path_volume_connect_light(kg, rng, &sd, &emission_sd, throughput, &state, &L);
+ kernel_path_volume_connect_light(kg, rng, &sd, &emission_sd, throughput, &state, L);
/* indirect light bounce */
- if(kernel_path_volume_bounce(kg, rng, &sd, &throughput, &state, &L, &ray))
+ if(kernel_path_volume_bounce(kg, rng, &sd, &throughput, &state, L, &ray))
continue;
else
break;
@@ -621,7 +626,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
#ifdef __BACKGROUND__
/* sample background shader */
float3 L_background = indirect_background(kg, &emission_sd, &state, &ray);
- path_radiance_accum_background(&L, &state, throughput, L_background);
+ path_radiance_accum_background(L, &state, throughput, L_background);
#endif /* __BACKGROUND__ */
break;
@@ -638,11 +643,16 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
#ifdef __SHADOW_TRICKS__
if((sd.object_flag & SD_OBJECT_SHADOW_CATCHER)) {
if(state.flag & PATH_RAY_CAMERA) {
- state.flag |= (PATH_RAY_SHADOW_CATCHER | PATH_RAY_SHADOW_CATCHER_ONLY);
+ state.flag |= (PATH_RAY_SHADOW_CATCHER |
+ PATH_RAY_SHADOW_CATCHER_ONLY |
+ PATH_RAY_STORE_SHADOW_INFO);
state.catcher_object = sd.object;
if(!kernel_data.background.transparent) {
- L.shadow_color = indirect_background(kg, &emission_sd, &state, &ray);
+ L->shadow_background_color =
+ indirect_background(kg, &emission_sd, &state, &ray);
}
+ L->shadow_radiance_sum = path_radiance_clamp_and_sum(kg, L);
+ L->shadow_throughput = average(throughput);
}
}
else {
@@ -675,7 +685,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
#endif /* __HOLDOUT__ */
/* holdout mask objects do not write data passes */
- kernel_write_data_passes(kg, buffer, &L, &sd, sample, &state, throughput);
+ kernel_write_data_passes(kg, buffer, L, &sd, sample, &state, throughput);
/* blurring of bsdf after bounces, for rays that have a small likelihood
* of following this particular path (diffuse, rough glossy) */
@@ -693,7 +703,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
if(sd.flag & SD_EMISSION) {
/* todo: is isect.t wrong here for transparent surfaces? */
float3 emission = indirect_primitive_emission(kg, &sd, isect.t, state.flag, state.ray_pdf);
- path_radiance_accum_emission(&L, throughput, emission, state.bounce);
+ path_radiance_accum_emission(L, throughput, emission, state.bounce);
}
#endif /* __EMISSION__ */
@@ -713,10 +723,12 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
throughput /= probability;
}
+ kernel_update_denoising_features(kg, &sd, &state, L);
+
#ifdef __AO__
/* ambient occlusion */
if(kernel_data.integrator.use_ambient_occlusion || (sd.flag & SD_AO)) {
- kernel_path_ao(kg, &sd, &emission_sd, &L, &state, rng, throughput, shader_bsdf_alpha(kg, &sd));
+ kernel_path_ao(kg, &sd, &emission_sd, L, &state, rng, throughput, shader_bsdf_alpha(kg, &sd));
}
#endif /* __AO__ */
@@ -727,7 +739,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
if(kernel_path_subsurface_scatter(kg,
&sd,
&emission_sd,
- &L,
+ L,
&state,
rng,
&ray,
@@ -740,15 +752,15 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
#endif /* __SUBSURFACE__ */
/* direct lighting */
- kernel_path_surface_connect_light(kg, rng, &sd, &emission_sd, throughput, &state, &L);
+ kernel_path_surface_connect_light(kg, rng, &sd, &emission_sd, throughput, &state, L);
/* compute direct lighting and next bounce */
- if(!kernel_path_surface_bounce(kg, rng, &sd, &throughput, &state, &L, &ray))
+ if(!kernel_path_surface_bounce(kg, rng, &sd, &throughput, &state, L, &ray))
break;
}
#ifdef __SUBSURFACE__
- kernel_path_subsurface_accum_indirect(&ss_indirect, &L);
+ kernel_path_subsurface_accum_indirect(&ss_indirect, L);
/* Trace indirect subsurface rays by restarting the loop. this uses less
* stack memory than invoking kernel_path_indirect.
@@ -758,7 +770,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
&ss_indirect,
&state,
&ray,
- &L,
+ L,
&throughput);
}
else {
@@ -767,24 +779,15 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
}
#endif /* __SUBSURFACE__ */
- float3 L_sum;
#ifdef __SHADOW_TRICKS__
- if(state.flag & PATH_RAY_SHADOW_CATCHER) {
- L_sum = path_radiance_sum_shadowcatcher(kg, &L, &L_transparent);
- }
- else
+ *is_shadow_catcher = (state.flag & PATH_RAY_SHADOW_CATCHER);
#endif /* __SHADOW_TRICKS__ */
- {
- L_sum = path_radiance_clamp_and_sum(kg, &L);
- }
-
- kernel_write_light_passes(kg, buffer, &L, sample);
#ifdef __KERNEL_DEBUG__
kernel_write_debug_passes(kg, buffer, &state, &debug_data, sample);
#endif /* __KERNEL_DEBUG__ */
- return make_float4(L_sum.x, L_sum.y, L_sum.z, 1.0f - L_transparent);
+ return 1.0f - L_transparent;
}
ccl_device void kernel_path_trace(KernelGlobals *kg,
@@ -805,18 +808,21 @@ ccl_device void kernel_path_trace(KernelGlobals *kg,
kernel_path_trace_setup(kg, rng_state, sample, x, y, &rng, &ray);
/* integrate */
- float4 L;
-
- if(ray.t != 0.0f)
- L = kernel_path_integrate(kg, &rng, sample, ray, buffer);
- else
- L = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+ PathRadiance L;
+ bool is_shadow_catcher;
- /* accumulate result in output buffer */
- kernel_write_pass_float4(buffer, sample, L);
+ if(ray.t != 0.0f) {
+ float alpha = kernel_path_integrate(kg, &rng, sample, ray, buffer, &L, &is_shadow_catcher);
+ kernel_write_result(kg, buffer, sample, &L, alpha, is_shadow_catcher);
+ }
+ else {
+ kernel_write_result(kg, buffer, sample, NULL, 0.0f, false);
+ }
path_rng_end(kg, rng_state, rng);
}
+#endif /* __SPLIT_KERNEL__ */
+
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernel_path_branched.h b/intern/cycles/kernel/kernel_path_branched.h
index 36fd6c95fe7..77d4f1df447 100644
--- a/intern/cycles/kernel/kernel_path_branched.h
+++ b/intern/cycles/kernel/kernel_path_branched.h
@@ -22,7 +22,7 @@ ccl_device_inline void kernel_branched_path_ao(KernelGlobals *kg,
ShaderData *sd,
ShaderData *emission_sd,
PathRadiance *L,
- PathState *state,
+ ccl_addr_space PathState *state,
RNG *rng,
float3 throughput)
{
@@ -56,29 +56,48 @@ ccl_device_inline void kernel_branched_path_ao(KernelGlobals *kg,
light_ray.dD = differential3_zero();
if(!shadow_blocked(kg, emission_sd, state, &light_ray, &ao_shadow)) {
- path_radiance_accum_ao(L, throughput*num_samples_inv, ao_alpha, ao_bsdf, ao_shadow, state->bounce);
+ path_radiance_accum_ao(L, state, throughput*num_samples_inv, ao_alpha, ao_bsdf, ao_shadow);
}
else {
- path_radiance_accum_total_ao(L, throughput*num_samples_inv, ao_bsdf);
+ path_radiance_accum_total_ao(L, state, throughput*num_samples_inv, ao_bsdf);
}
}
}
}
+#ifndef __SPLIT_KERNEL__
/* bounce off surface and integrate indirect light */
ccl_device_noinline void kernel_branched_path_surface_indirect_light(KernelGlobals *kg,
RNG *rng, ShaderData *sd, ShaderData *indirect_sd, ShaderData *emission_sd,
float3 throughput, float num_samples_adjust, PathState *state, PathRadiance *L)
{
+ float sum_sample_weight = 0.0f;
+#ifdef __DENOISING_FEATURES__
+ if(state->denoising_feature_weight > 0.0f) {
+ for(int i = 0; i < sd->num_closure; i++) {
+ const ShaderClosure *sc = &sd->closure[i];
+
+ /* transparency is not handled here, but in outer loop */
+ if(!CLOSURE_IS_BSDF(sc->type) || CLOSURE_IS_BSDF_TRANSPARENT(sc->type)) {
+ continue;
+ }
+
+ sum_sample_weight += sc->sample_weight;
+ }
+ }
+ else {
+ sum_sample_weight = 1.0f;
+ }
+#endif /* __DENOISING_FEATURES__ */
+
for(int i = 0; i < sd->num_closure; i++) {
const ShaderClosure *sc = &sd->closure[i];
- if(!CLOSURE_IS_BSDF(sc->type))
- continue;
/* transparency is not handled here, but in outer loop */
- if(sc->type == CLOSURE_BSDF_TRANSPARENT_ID)
+ if(!CLOSURE_IS_BSDF(sc->type) || CLOSURE_IS_BSDF_TRANSPARENT(sc->type)) {
continue;
+ }
int num_samples;
@@ -110,7 +129,8 @@ ccl_device_noinline void kernel_branched_path_surface_indirect_light(KernelGloba
&tp,
&ps,
L,
- &bsdf_ray))
+ &bsdf_ray,
+ sum_sample_weight))
{
continue;
}
@@ -242,14 +262,19 @@ ccl_device void kernel_branched_path_subsurface_scatter(KernelGlobals *kg,
}
#endif /* __SUBSURFACE__ */
-ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, int sample, Ray ray, ccl_global float *buffer)
+ccl_device float kernel_branched_path_integrate(KernelGlobals *kg,
+ RNG *rng,
+ int sample,
+ Ray ray,
+ ccl_global float *buffer,
+ PathRadiance *L,
+ bool *is_shadow_catcher)
{
/* initialize */
- PathRadiance L;
float3 throughput = make_float3(1.0f, 1.0f, 1.0f);
float L_transparent = 0.0f;
- path_radiance_init(&L, kernel_data.film.use_light_pass);
+ path_radiance_init(L, kernel_data.film.use_light_pass);
/* shader data memory used for both volumes and surfaces, saves stack space */
ShaderData sd;
@@ -329,7 +354,7 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
int all = kernel_data.integrator.sample_all_lights_direct;
kernel_branched_path_volume_connect_light(kg, rng, &sd,
- &emission_sd, throughput, &state, &L, all,
+ &emission_sd, throughput, &state, L, all,
&volume_ray, &volume_segment);
/* indirect light sampling */
@@ -337,11 +362,6 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
float num_samples_inv = 1.0f/num_samples;
for(int j = 0; j < num_samples; j++) {
- /* workaround to fix correlation bug in T38710, can find better solution
- * in random number generator later, for now this is done here to not impact
- * performance of rendering without volumes */
- RNG tmp_rng = cmj_hash(*rng, state.rng_offset);
-
PathState ps = state;
Ray pray = ray;
float3 tp = throughput;
@@ -352,8 +372,8 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
/* scatter sample. if we use distance sampling and take just one
* sample for direct and indirect light, we could share this
* computation, but makes code a bit complex */
- float rphase = path_state_rng_1D_for_decision(kg, &tmp_rng, &ps, PRNG_PHASE);
- float rscatter = path_state_rng_1D_for_decision(kg, &tmp_rng, &ps, PRNG_SCATTER_DISTANCE);
+ float rphase = path_state_rng_1D_for_decision(kg, rng, &ps, PRNG_PHASE);
+ float rscatter = path_state_rng_1D_for_decision(kg, rng, &ps, PRNG_SCATTER_DISTANCE);
VolumeIntegrateResult result = kernel_volume_decoupled_scatter(kg,
&ps, &pray, &sd, &tp, rphase, rscatter, &volume_segment, NULL, false);
@@ -366,7 +386,7 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
&sd,
&tp,
&ps,
- &L,
+ L,
&pray))
{
kernel_path_indirect(kg,
@@ -377,19 +397,19 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
tp*num_samples_inv,
num_samples,
&ps,
- &L);
+ L);
/* for render passes, sum and reset indirect light pass variables
* for the next samples */
- path_radiance_sum_indirect(&L);
- path_radiance_reset_indirect(&L);
+ path_radiance_sum_indirect(L);
+ path_radiance_reset_indirect(L);
}
}
}
/* emission and transmittance */
if(volume_segment.closure_flag & SD_EMISSION)
- path_radiance_accum_emission(&L, throughput, volume_segment.accum_emission, state.bounce);
+ path_radiance_accum_emission(L, throughput, volume_segment.accum_emission, state.bounce);
throughput *= volume_segment.accum_transmittance;
/* free cached steps */
@@ -411,20 +431,20 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
path_state_branch(&ps, j, num_samples);
VolumeIntegrateResult result = kernel_volume_integrate(
- kg, &ps, &sd, &volume_ray, &L, &tp, rng, heterogeneous);
+ kg, &ps, &sd, &volume_ray, L, &tp, rng, heterogeneous);
#ifdef __VOLUME_SCATTER__
if(result == VOLUME_PATH_SCATTERED) {
/* todo: support equiangular, MIS and all light sampling.
* alternatively get decoupled ray marching working on the GPU */
- kernel_path_volume_connect_light(kg, rng, &sd, &emission_sd, tp, &state, &L);
+ kernel_path_volume_connect_light(kg, rng, &sd, &emission_sd, tp, &state, L);
if(kernel_path_volume_bounce(kg,
rng,
&sd,
&tp,
&ps,
- &L,
+ L,
&pray))
{
kernel_path_indirect(kg,
@@ -435,12 +455,12 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
tp,
num_samples,
&ps,
- &L);
+ L);
/* for render passes, sum and reset indirect light pass variables
* for the next samples */
- path_radiance_sum_indirect(&L);
- path_radiance_reset_indirect(&L);
+ path_radiance_sum_indirect(L);
+ path_radiance_reset_indirect(L);
}
}
#endif /* __VOLUME_SCATTER__ */
@@ -466,7 +486,7 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
#ifdef __BACKGROUND__
/* sample background shader */
float3 L_background = indirect_background(kg, &emission_sd, &state, &ray);
- path_radiance_accum_background(&L, &state, throughput, L_background);
+ path_radiance_accum_background(L, &state, throughput, L_background);
#endif /* __BACKGROUND__ */
break;
@@ -479,13 +499,16 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
#ifdef __SHADOW_TRICKS__
if((sd.object_flag & SD_OBJECT_SHADOW_CATCHER)) {
- if(state.flag & PATH_RAY_CAMERA) {
- state.flag |= (PATH_RAY_SHADOW_CATCHER | PATH_RAY_SHADOW_CATCHER_ONLY);
- state.catcher_object = sd.object;
- if(!kernel_data.background.transparent) {
- L.shadow_color = indirect_background(kg, &emission_sd, &state, &ray);
- }
+ state.flag |= (PATH_RAY_SHADOW_CATCHER |
+ PATH_RAY_SHADOW_CATCHER_ONLY |
+ PATH_RAY_STORE_SHADOW_INFO);
+ state.catcher_object = sd.object;
+ if(!kernel_data.background.transparent) {
+ L->shadow_background_color =
+ indirect_background(kg, &emission_sd, &state, &ray);
}
+ L->shadow_radiance_sum = path_radiance_clamp_and_sum(kg, L);
+ L->shadow_throughput = average(throughput);
}
else {
state.flag &= ~PATH_RAY_SHADOW_CATCHER_ONLY;
@@ -513,13 +536,13 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
#endif /* __HOLDOUT__ */
/* holdout mask objects do not write data passes */
- kernel_write_data_passes(kg, buffer, &L, &sd, sample, &state, throughput);
+ kernel_write_data_passes(kg, buffer, L, &sd, sample, &state, throughput);
#ifdef __EMISSION__
/* emission */
if(sd.flag & SD_EMISSION) {
float3 emission = indirect_primitive_emission(kg, &sd, isect.t, state.flag, state.ray_pdf);
- path_radiance_accum_emission(&L, throughput, emission, state.bounce);
+ path_radiance_accum_emission(L, throughput, emission, state.bounce);
}
#endif /* __EMISSION__ */
@@ -543,10 +566,12 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
}
}
+ kernel_update_denoising_features(kg, &sd, &state, L);
+
#ifdef __AO__
/* ambient occlusion */
if(kernel_data.integrator.use_ambient_occlusion || (sd.flag & SD_AO)) {
- kernel_branched_path_ao(kg, &sd, &emission_sd, &L, &state, rng, throughput);
+ kernel_branched_path_ao(kg, &sd, &emission_sd, L, &state, rng, throughput);
}
#endif /* __AO__ */
@@ -554,7 +579,7 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
/* bssrdf scatter to a different location on the same object */
if(sd.flag & SD_BSSRDF) {
kernel_branched_path_subsurface_scatter(kg, &sd, &indirect_sd, &emission_sd,
- &L, &state, rng, &ray, throughput);
+ L, &state, rng, &ray, throughput);
}
#endif /* __SUBSURFACE__ */
@@ -567,13 +592,13 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
int all = (kernel_data.integrator.sample_all_lights_direct) ||
(state.flag & PATH_RAY_SHADOW_CATCHER);
kernel_branched_path_surface_connect_light(kg, rng,
- &sd, &emission_sd, &hit_state, throughput, 1.0f, &L, all);
+ &sd, &emission_sd, &hit_state, throughput, 1.0f, L, all);
}
#endif /* __EMISSION__ */
/* indirect light */
kernel_branched_path_surface_indirect_light(kg, rng,
- &sd, &indirect_sd, &emission_sd, throughput, 1.0f, &hit_state, &L);
+ &sd, &indirect_sd, &emission_sd, throughput, 1.0f, &hit_state, L);
/* continue in case of transparency */
throughput *= shader_bsdf_transparency(kg, &sd);
@@ -602,24 +627,15 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
#endif /* __VOLUME__ */
}
- float3 L_sum;
#ifdef __SHADOW_TRICKS__
- if(state.flag & PATH_RAY_SHADOW_CATCHER) {
- L_sum = path_radiance_sum_shadowcatcher(kg, &L, &L_transparent);
- }
- else
+ *is_shadow_catcher = (state.flag & PATH_RAY_SHADOW_CATCHER);
#endif /* __SHADOW_TRICKS__ */
- {
- L_sum = path_radiance_clamp_and_sum(kg, &L);
- }
-
- kernel_write_light_passes(kg, buffer, &L, sample);
#ifdef __KERNEL_DEBUG__
kernel_write_debug_passes(kg, buffer, &state, &debug_data, sample);
#endif /* __KERNEL_DEBUG__ */
- return make_float4(L_sum.x, L_sum.y, L_sum.z, 1.0f - L_transparent);
+ return 1.0f - L_transparent;
}
ccl_device void kernel_branched_path_trace(KernelGlobals *kg,
@@ -640,20 +656,22 @@ ccl_device void kernel_branched_path_trace(KernelGlobals *kg,
kernel_path_trace_setup(kg, rng_state, sample, x, y, &rng, &ray);
/* integrate */
- float4 L;
-
- if(ray.t != 0.0f)
- L = kernel_branched_path_integrate(kg, &rng, sample, ray, buffer);
- else
- L = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+ PathRadiance L;
+ bool is_shadow_catcher;
- /* accumulate result in output buffer */
- kernel_write_pass_float4(buffer, sample, L);
+ if(ray.t != 0.0f) {
+ float alpha = kernel_branched_path_integrate(kg, &rng, sample, ray, buffer, &L, &is_shadow_catcher);
+ kernel_write_result(kg, buffer, sample, &L, alpha, is_shadow_catcher);
+ }
+ else {
+ kernel_write_result(kg, buffer, sample, NULL, 0.0f, false);
+ }
path_rng_end(kg, rng_state, rng);
}
+#endif /* __SPLIT_KERNEL__ */
+
#endif /* __BRANCHED_PATH__ */
CCL_NAMESPACE_END
-
diff --git a/intern/cycles/kernel/kernel_path_state.h b/intern/cycles/kernel/kernel_path_state.h
index c0cd2a63120..5d92fd12201 100644
--- a/intern/cycles/kernel/kernel_path_state.h
+++ b/intern/cycles/kernel/kernel_path_state.h
@@ -35,6 +35,16 @@ ccl_device_inline void path_state_init(KernelGlobals *kg,
state->transmission_bounce = 0;
state->transparent_bounce = 0;
+#ifdef __DENOISING_FEATURES__
+ if(kernel_data.film.pass_denoising_data) {
+ state->flag |= PATH_RAY_STORE_SHADOW_INFO;
+ state->denoising_feature_weight = 1.0f;
+ }
+ else {
+ state->denoising_feature_weight = 0.0f;
+ }
+#endif /* __DENOISING_FEATURES__ */
+
state->min_ray_pdf = FLT_MAX;
state->ray_pdf = 0.0f;
#ifdef __LAMP_MIS__
@@ -128,6 +138,12 @@ ccl_device_inline void path_state_next(KernelGlobals *kg, ccl_addr_space PathSta
/* random number generator next bounce */
state->rng_offset += PRNG_BOUNCE_NUM;
+
+#ifdef __DENOISING_FEATURES__
+ if((state->denoising_feature_weight == 0.0f) && !(state->flag & PATH_RAY_SHADOW_CATCHER)) {
+ state->flag &= ~PATH_RAY_STORE_SHADOW_INFO;
+ }
+#endif
}
ccl_device_inline uint path_state_ray_visibility(KernelGlobals *kg, PathState *state)
diff --git a/intern/cycles/kernel/kernel_path_surface.h b/intern/cycles/kernel/kernel_path_surface.h
index 076c82f3853..dcb577e176f 100644
--- a/intern/cycles/kernel/kernel_path_surface.h
+++ b/intern/cycles/kernel/kernel_path_surface.h
@@ -16,7 +16,7 @@
CCL_NAMESPACE_BEGIN
-#if defined(__BRANCHED_PATH__) || defined(__SUBSURFACE__) || defined(__SHADOW_TRICKS__)
+#if defined(__BRANCHED_PATH__) || defined(__SUBSURFACE__) || defined(__SHADOW_TRICKS__) || defined(__BAKING__)
/* branched path tracing: connect path directly to position on one or more lights and add it to L */
ccl_device_noinline void kernel_branched_path_surface_connect_light(
KernelGlobals *kg,
@@ -70,10 +70,10 @@ ccl_device_noinline void kernel_branched_path_surface_connect_light(
if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) {
/* accumulate */
- path_radiance_accum_light(L, throughput*num_samples_inv, &L_light, shadow, num_samples_inv, state->bounce, is_lamp);
+ path_radiance_accum_light(L, state, throughput*num_samples_inv, &L_light, shadow, num_samples_inv, is_lamp);
}
else {
- path_radiance_accum_total_light(L, throughput*num_samples_inv, &L_light);
+ path_radiance_accum_total_light(L, state, throughput*num_samples_inv, &L_light);
}
}
}
@@ -107,10 +107,10 @@ ccl_device_noinline void kernel_branched_path_surface_connect_light(
if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) {
/* accumulate */
- path_radiance_accum_light(L, throughput*num_samples_inv, &L_light, shadow, num_samples_inv, state->bounce, is_lamp);
+ path_radiance_accum_light(L, state, throughput*num_samples_inv, &L_light, shadow, num_samples_inv, is_lamp);
}
else {
- path_radiance_accum_total_light(L, throughput*num_samples_inv, &L_light);
+ path_radiance_accum_total_light(L, state, throughput*num_samples_inv, &L_light);
}
}
}
@@ -133,10 +133,10 @@ ccl_device_noinline void kernel_branched_path_surface_connect_light(
if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) {
/* accumulate */
- path_radiance_accum_light(L, throughput*num_samples_adjust, &L_light, shadow, num_samples_adjust, state->bounce, is_lamp);
+ path_radiance_accum_light(L, state, throughput*num_samples_adjust, &L_light, shadow, num_samples_adjust, is_lamp);
}
else {
- path_radiance_accum_total_light(L, throughput*num_samples_adjust, &L_light);
+ path_radiance_accum_total_light(L, state, throughput*num_samples_adjust, &L_light);
}
}
}
@@ -155,7 +155,8 @@ ccl_device bool kernel_branched_path_surface_bounce(
ccl_addr_space float3 *throughput,
ccl_addr_space PathState *state,
PathRadiance *L,
- Ray *ray)
+ ccl_addr_space Ray *ray,
+ float sum_sample_weight)
{
/* sample BSDF */
float bsdf_pdf;
@@ -175,6 +176,10 @@ ccl_device bool kernel_branched_path_surface_bounce(
/* modify throughput */
path_radiance_bsdf_bounce(L, throughput, &bsdf_eval, bsdf_pdf, state->bounce, label);
+#ifdef __DENOISING_FEATURES__
+ state->denoising_feature_weight *= sc->sample_weight / (sum_sample_weight * num_samples);
+#endif
+
/* modify path state */
path_state_next(kg, state, label);
@@ -257,10 +262,10 @@ ccl_device_inline void kernel_path_surface_connect_light(KernelGlobals *kg, RNG
if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) {
/* accumulate */
- path_radiance_accum_light(L, throughput, &L_light, shadow, 1.0f, state->bounce, is_lamp);
+ path_radiance_accum_light(L, state, throughput, &L_light, shadow, 1.0f, is_lamp);
}
else {
- path_radiance_accum_total_light(L, throughput, &L_light);
+ path_radiance_accum_total_light(L, state, throughput, &L_light);
}
}
}
diff --git a/intern/cycles/kernel/kernel_path_volume.h b/intern/cycles/kernel/kernel_path_volume.h
index 371f2c1c7cb..dcedf51e479 100644
--- a/intern/cycles/kernel/kernel_path_volume.h
+++ b/intern/cycles/kernel/kernel_path_volume.h
@@ -55,7 +55,7 @@ ccl_device_inline void kernel_path_volume_connect_light(
if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) {
/* accumulate */
- path_radiance_accum_light(L, throughput, &L_light, shadow, 1.0f, state->bounce, is_lamp);
+ path_radiance_accum_light(L, state, throughput, &L_light, shadow, 1.0f, is_lamp);
}
}
}
@@ -184,7 +184,7 @@ ccl_device void kernel_branched_path_volume_connect_light(
if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) {
/* accumulate */
- path_radiance_accum_light(L, tp*num_samples_inv, &L_light, shadow, num_samples_inv, state->bounce, is_lamp);
+ path_radiance_accum_light(L, state, tp*num_samples_inv, &L_light, shadow, num_samples_inv, is_lamp);
}
}
}
@@ -233,7 +233,7 @@ ccl_device void kernel_branched_path_volume_connect_light(
if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) {
/* accumulate */
- path_radiance_accum_light(L, tp*num_samples_inv, &L_light, shadow, num_samples_inv, state->bounce, is_lamp);
+ path_radiance_accum_light(L, state, tp*num_samples_inv, &L_light, shadow, num_samples_inv, is_lamp);
}
}
}
@@ -271,7 +271,7 @@ ccl_device void kernel_branched_path_volume_connect_light(
if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) {
/* accumulate */
- path_radiance_accum_light(L, tp, &L_light, shadow, 1.0f, state->bounce, is_lamp);
+ path_radiance_accum_light(L, state, tp, &L_light, shadow, 1.0f, is_lamp);
}
}
}
diff --git a/intern/cycles/kernel/kernel_projection.h b/intern/cycles/kernel/kernel_projection.h
index 9a2b0884a7e..cbb2442d1dc 100644
--- a/intern/cycles/kernel/kernel_projection.h
+++ b/intern/cycles/kernel/kernel_projection.h
@@ -57,6 +57,9 @@ ccl_device float3 spherical_to_direction(float theta, float phi)
ccl_device float2 direction_to_equirectangular_range(float3 dir, float4 range)
{
+ if(is_zero(dir))
+ return make_float2(0.0f, 0.0f);
+
float u = (atan2f(dir.y, dir.x) - range.y) / range.x;
float v = (acosf(dir.z / len(dir)) - range.w) / range.z;
diff --git a/intern/cycles/kernel/kernel_queues.h b/intern/cycles/kernel/kernel_queues.h
index 96bc636d5ac..e32d4bbbc1b 100644
--- a/intern/cycles/kernel/kernel_queues.h
+++ b/intern/cycles/kernel/kernel_queues.h
@@ -128,6 +128,21 @@ ccl_device unsigned int get_global_queue_index(
return my_gqidx;
}
+ccl_device int dequeue_ray_index(
+ int queue_number,
+ ccl_global int *queues,
+ int queue_size,
+ ccl_global int *queue_index)
+{
+ int index = atomic_fetch_and_dec_uint32((ccl_global uint*)&queue_index[queue_number])-1;
+
+ if(index < 0) {
+ return QUEUE_EMPTY_SLOT;
+ }
+
+ return queues[index + queue_number * queue_size];
+}
+
CCL_NAMESPACE_END
#endif // __KERNEL_QUEUE_H__
diff --git a/intern/cycles/kernel/kernel_random.h b/intern/cycles/kernel/kernel_random.h
index d4f0caff5de..e8a912ccc0b 100644
--- a/intern/cycles/kernel/kernel_random.h
+++ b/intern/cycles/kernel/kernel_random.h
@@ -20,14 +20,15 @@ CCL_NAMESPACE_BEGIN
#ifdef __SOBOL__
-/* skip initial numbers that are not as well distributed, especially the
+/* Skip initial numbers that are not as well distributed, especially the
* first sequence is just 0 everywhere, which can be problematic for e.g.
- * path termination */
+ * path termination.
+ */
#define SOBOL_SKIP 64
-/* High Dimensional Sobol */
+/* High Dimensional Sobol. */
-/* van der corput radical inverse */
+/* Van der Corput radical inverse. */
ccl_device uint van_der_corput(uint bits)
{
bits = (bits << 16) | (bits >> 16);
@@ -38,58 +39,63 @@ ccl_device uint van_der_corput(uint bits)
return bits;
}
-/* sobol radical inverse */
+/* Sobol radical inverse. */
ccl_device uint sobol(uint i)
{
uint r = 0;
-
- for(uint v = 1U << 31; i; i >>= 1, v ^= v >> 1)
- if(i & 1)
+ for(uint v = 1U << 31; i; i >>= 1, v ^= v >> 1) {
+ if(i & 1) {
r ^= v;
-
+ }
+ }
return r;
}
-/* inverse of sobol radical inverse */
+/* Inverse of sobol radical inverse. */
ccl_device uint sobol_inverse(uint i)
{
const uint msb = 1U << 31;
uint r = 0;
-
- for(uint v = 1; i; i <<= 1, v ^= v << 1)
- if(i & msb)
+ for(uint v = 1; i; i <<= 1, v ^= v << 1) {
+ if(i & msb) {
r ^= v;
-
+ }
+ }
return r;
}
-/* multidimensional sobol with generator matrices
- * dimension 0 and 1 are equal to van_der_corput() and sobol() respectively */
+/* Multidimensional sobol with generator matrices
+ * dimension 0 and 1 are equal to van_der_corput() and sobol() respectively.
+ */
ccl_device uint sobol_dimension(KernelGlobals *kg, int index, int dimension)
{
uint result = 0;
uint i = index;
-
- for(uint j = 0; i; i >>= 1, j++)
- if(i & 1)
+ for(uint j = 0; i; i >>= 1, j++) {
+ if(i & 1) {
result ^= kernel_tex_fetch(__sobol_directions, 32*dimension + j);
-
+ }
+ }
return result;
}
-/* lookup index and x/y coordinate, assumes m is a power of two */
-ccl_device uint sobol_lookup(const uint m, const uint frame, const uint ex, const uint ey, uint *x, uint *y)
+/* Lookup index and x/y coordinate, assumes m is a power of two. */
+ccl_device uint sobol_lookup(const uint m,
+ const uint frame,
+ const uint ex,
+ const uint ey,
+ uint *x, uint *y)
{
- /* shift is constant per frame */
+ /* Shift is constant per frame. */
const uint shift = frame << (m << 1);
const uint sobol_shift = sobol(shift);
- /* van der Corput is its own inverse */
+ /* Van der Corput is its own inverse. */
const uint lower = van_der_corput(ex << (32 - m));
- /* need to compensate for ey difference and shift */
+ /* Need to compensate for ey difference and shift. */
const uint sobol_lower = sobol(lower);
- const uint mask = ~-(1 << m) << (32 - m); /* only m upper bits */
+ const uint mask = ~-(1 << m) << (32 - m); /* Only m upper bits. */
const uint delta = ((ey << (32 - m)) ^ sobol_lower ^ sobol_shift) & mask;
- /* only use m upper bits for the index (m is a power of two) */
+ /* Only use m upper bits for the index (m is a power of two). */
const uint sobol_result = delta | (delta >> m);
const uint upper = sobol_inverse(sobol_result);
const uint index = shift | upper | lower;
@@ -98,11 +104,14 @@ ccl_device uint sobol_lookup(const uint m, const uint frame, const uint ex, cons
return index;
}
-ccl_device_forceinline float path_rng_1D(KernelGlobals *kg, RNG *rng, int sample, int num_samples, int dimension)
+ccl_device_forceinline float path_rng_1D(KernelGlobals *kg,
+ RNG *rng,
+ int sample, int num_samples,
+ int dimension)
{
#ifdef __CMJ__
if(kernel_data.integrator.sampling_pattern == SAMPLING_PATTERN_CMJ) {
- /* correlated multi-jittered */
+ /* Correlated multi-jitter. */
int p = *rng + dimension;
return cmj_sample_1D(sample, num_samples, p);
}
@@ -113,7 +122,7 @@ ccl_device_forceinline float path_rng_1D(KernelGlobals *kg, RNG *rng, int sample
float r = (float)result * (1.0f/(float)0xFFFFFFFF);
return r;
#else
- /* compute sobol sequence value using direction vectors */
+ /* Compute sobol sequence value using direction vectors. */
uint result = sobol_dimension(kg, sample + SOBOL_SKIP, dimension);
float r = (float)result * (1.0f/(float)0xFFFFFFFF);
@@ -130,24 +139,33 @@ ccl_device_forceinline float path_rng_1D(KernelGlobals *kg, RNG *rng, int sample
#endif
}
-ccl_device_forceinline void path_rng_2D(KernelGlobals *kg, RNG *rng, int sample, int num_samples, int dimension, float *fx, float *fy)
+ccl_device_forceinline void path_rng_2D(KernelGlobals *kg,
+ RNG *rng,
+ int sample, int num_samples,
+ int dimension,
+ float *fx, float *fy)
{
#ifdef __CMJ__
if(kernel_data.integrator.sampling_pattern == SAMPLING_PATTERN_CMJ) {
- /* correlated multi-jittered */
+ /* Correlated multi-jitter. */
int p = *rng + dimension;
cmj_sample_2D(sample, num_samples, p, fx, fy);
}
else
#endif
{
- /* sobol */
+ /* Sobol. */
*fx = path_rng_1D(kg, rng, sample, num_samples, dimension);
*fy = path_rng_1D(kg, rng, sample, num_samples, dimension + 1);
}
}
-ccl_device_inline void path_rng_init(KernelGlobals *kg, ccl_global uint *rng_state, int sample, int num_samples, RNG *rng, int x, int y, float *fx, float *fy)
+ccl_device_inline void path_rng_init(KernelGlobals *kg,
+ ccl_global uint *rng_state,
+ int sample, int num_samples,
+ RNG *rng,
+ int x, int y,
+ float *fx, float *fy)
{
#ifdef __SOBOL_FULL_SCREEN__
uint px, py;
@@ -182,29 +200,43 @@ ccl_device_inline void path_rng_init(KernelGlobals *kg, ccl_global uint *rng_sta
#endif
}
-ccl_device void path_rng_end(KernelGlobals *kg, ccl_global uint *rng_state, RNG rng)
+ccl_device void path_rng_end(KernelGlobals *kg,
+ ccl_global uint *rng_state,
+ RNG rng)
{
/* nothing to do */
}
-#else
+#else /* __SOBOL__ */
/* Linear Congruential Generator */
-ccl_device_forceinline float path_rng_1D(KernelGlobals *kg, RNG *rng, int sample, int num_samples, int dimension)
+ccl_device_forceinline float path_rng_1D(KernelGlobals *kg,
+ RNG *rng,
+ int sample, int num_samples,
+ int dimension)
{
/* implicit mod 2^32 */
*rng = (1103515245*(*rng) + 12345);
return (float)*rng * (1.0f/(float)0xFFFFFFFF);
}
-ccl_device_inline void path_rng_2D(KernelGlobals *kg, RNG *rng, int sample, int num_samples, int dimension, float *fx, float *fy)
+ccl_device_inline void path_rng_2D(KernelGlobals *kg,
+ RNG *rng,
+ int sample, int num_samples,
+ int dimension,
+ float *fx, float *fy)
{
*fx = path_rng_1D(kg, rng, sample, num_samples, dimension);
*fy = path_rng_1D(kg, rng, sample, num_samples, dimension + 1);
}
-ccl_device void path_rng_init(KernelGlobals *kg, ccl_global uint *rng_state, int sample, int num_samples, RNG *rng, int x, int y, float *fx, float *fy)
+ccl_device void path_rng_init(KernelGlobals *kg,
+ ccl_global uint *rng_state,
+ int sample, int num_samples,
+ RNG *rng,
+ int x, int y,
+ float *fx, float *fy)
{
/* load state */
*rng = *rng_state;
@@ -220,13 +252,15 @@ ccl_device void path_rng_init(KernelGlobals *kg, ccl_global uint *rng_state, int
}
}
-ccl_device void path_rng_end(KernelGlobals *kg, ccl_global uint *rng_state, RNG rng)
+ccl_device void path_rng_end(KernelGlobals *kg,
+ ccl_global uint *rng_state,
+ RNG rng)
{
/* store state for next sample */
*rng_state = rng;
}
-#endif
+#endif /* __SOBOL__ */
/* Linear Congruential Generator */
@@ -257,49 +291,108 @@ ccl_device uint lcg_init(uint seed)
* dimension to avoid using the same sequence twice.
*
* For branches in the path we must be careful not to reuse the same number
- * in a sequence and offset accordingly. */
+ * in a sequence and offset accordingly.
+ */
-ccl_device_inline float path_state_rng_1D(KernelGlobals *kg, RNG *rng, const ccl_addr_space PathState *state, int dimension)
+ccl_device_inline float path_state_rng_1D(KernelGlobals *kg,
+ RNG *rng,
+ const ccl_addr_space PathState *state,
+ int dimension)
{
- return path_rng_1D(kg, rng, state->sample, state->num_samples, state->rng_offset + dimension);
+ return path_rng_1D(kg,
+ rng,
+ state->sample, state->num_samples,
+ state->rng_offset + dimension);
}
-ccl_device_inline float path_state_rng_1D_for_decision(KernelGlobals *kg, RNG *rng, const ccl_addr_space PathState *state, int dimension)
+ccl_device_inline float path_state_rng_1D_for_decision(
+ KernelGlobals *kg,
+ RNG *rng,
+ const ccl_addr_space PathState *state,
+ int dimension)
{
- /* the rng_offset is not increased for transparent bounces. if we do then
+ /* The rng_offset is not increased for transparent bounces. if we do then
* fully transparent objects can become subtly visible by the different
* sampling patterns used where the transparent object is.
*
* however for some random numbers that will determine if we next bounce
* is transparent we do need to increase the offset to avoid always making
- * the same decision */
- int rng_offset = state->rng_offset + state->transparent_bounce*PRNG_BOUNCE_NUM;
- return path_rng_1D(kg, rng, state->sample, state->num_samples, rng_offset + dimension);
+ * the same decision. */
+ const int rng_offset = state->rng_offset + state->transparent_bounce * PRNG_BOUNCE_NUM;
+ return path_rng_1D(kg,
+ rng,
+ state->sample, state->num_samples,
+ rng_offset + dimension);
}
-ccl_device_inline void path_state_rng_2D(KernelGlobals *kg, RNG *rng, const ccl_addr_space PathState *state, int dimension, float *fx, float *fy)
+ccl_device_inline void path_state_rng_2D(KernelGlobals *kg,
+ RNG *rng,
+ const ccl_addr_space PathState *state,
+ int dimension,
+ float *fx, float *fy)
{
- path_rng_2D(kg, rng, state->sample, state->num_samples, state->rng_offset + dimension, fx, fy);
+ path_rng_2D(kg,
+ rng,
+ state->sample, state->num_samples,
+ state->rng_offset + dimension,
+ fx, fy);
}
-ccl_device_inline float path_branched_rng_1D(KernelGlobals *kg, RNG *rng, const ccl_addr_space PathState *state, int branch, int num_branches, int dimension)
+ccl_device_inline float path_branched_rng_1D(
+ KernelGlobals *kg,
+ RNG *rng,
+ const ccl_addr_space PathState *state,
+ int branch,
+ int num_branches,
+ int dimension)
{
- return path_rng_1D(kg, rng, state->sample*num_branches + branch, state->num_samples*num_branches, state->rng_offset + dimension);
+ return path_rng_1D(kg,
+ rng,
+ state->sample * num_branches + branch,
+ state->num_samples * num_branches,
+ state->rng_offset + dimension);
}
-ccl_device_inline float path_branched_rng_1D_for_decision(KernelGlobals *kg, RNG *rng, const ccl_addr_space PathState *state, int branch, int num_branches, int dimension)
+ccl_device_inline float path_branched_rng_1D_for_decision(
+ KernelGlobals *kg,
+ RNG *rng,
+ const ccl_addr_space PathState *state,
+ int branch,
+ int num_branches,
+ int dimension)
{
- int rng_offset = state->rng_offset + state->transparent_bounce*PRNG_BOUNCE_NUM;
- return path_rng_1D(kg, rng, state->sample*num_branches + branch, state->num_samples*num_branches, rng_offset + dimension);
+ const int rng_offset = state->rng_offset + state->transparent_bounce * PRNG_BOUNCE_NUM;
+ return path_rng_1D(kg,
+ rng,
+ state->sample * num_branches + branch,
+ state->num_samples * num_branches,
+ rng_offset + dimension);
}
-ccl_device_inline void path_branched_rng_2D(KernelGlobals *kg, RNG *rng, const ccl_addr_space PathState *state, int branch, int num_branches, int dimension, float *fx, float *fy)
+ccl_device_inline void path_branched_rng_2D(
+ KernelGlobals *kg,
+ RNG *rng,
+ const ccl_addr_space PathState *state,
+ int branch,
+ int num_branches,
+ int dimension,
+ float *fx, float *fy)
{
- path_rng_2D(kg, rng, state->sample*num_branches + branch, state->num_samples*num_branches, state->rng_offset + dimension, fx, fy);
+ path_rng_2D(kg,
+ rng,
+ state->sample * num_branches + branch,
+ state->num_samples * num_branches,
+ state->rng_offset + dimension,
+ fx, fy);
}
-/* Utitility functions to get light termination value, since it might not be needed in many cases. */
-ccl_device_inline float path_state_rng_light_termination(KernelGlobals *kg, RNG *rng, const ccl_addr_space PathState *state)
+/* Utitility functions to get light termination value,
+ * since it might not be needed in many cases.
+ */
+ccl_device_inline float path_state_rng_light_termination(
+ KernelGlobals *kg,
+ RNG *rng,
+ const ccl_addr_space PathState *state)
{
if(kernel_data.integrator.light_inv_rr_threshold > 0.0f) {
return path_state_rng_1D_for_decision(kg, rng, state, PRNG_LIGHT_TERMINATE);
@@ -307,15 +400,27 @@ ccl_device_inline float path_state_rng_light_termination(KernelGlobals *kg, RNG
return 0.0f;
}
-ccl_device_inline float path_branched_rng_light_termination(KernelGlobals *kg, RNG *rng, const ccl_addr_space PathState *state, int branch, int num_branches)
+ccl_device_inline float path_branched_rng_light_termination(
+ KernelGlobals *kg,
+ RNG *rng,
+ const ccl_addr_space PathState *state,
+ int branch,
+ int num_branches)
{
if(kernel_data.integrator.light_inv_rr_threshold > 0.0f) {
- return path_branched_rng_1D_for_decision(kg, rng, state, branch, num_branches, PRNG_LIGHT_TERMINATE);
+ return path_branched_rng_1D_for_decision(kg,
+ rng,
+ state,
+ branch,
+ num_branches,
+ PRNG_LIGHT_TERMINATE);
}
return 0.0f;
}
-ccl_device_inline void path_state_branch(ccl_addr_space PathState *state, int branch, int num_branches)
+ccl_device_inline void path_state_branch(ccl_addr_space PathState *state,
+ int branch,
+ int num_branches)
{
/* path is splitting into a branch, adjust so that each branch
* still gets a unique sample from the same sequence */
@@ -324,14 +429,17 @@ ccl_device_inline void path_state_branch(ccl_addr_space PathState *state, int br
state->num_samples = state->num_samples*num_branches;
}
-ccl_device_inline uint lcg_state_init(RNG *rng, int rng_offset, int sample, uint scramble)
+ccl_device_inline uint lcg_state_init(RNG *rng,
+ int rng_offset,
+ int sample,
+ uint scramble)
{
return lcg_init(*rng + rng_offset + sample*scramble);
}
ccl_device float lcg_step_float_addrspace(ccl_addr_space uint *rng)
{
- /* implicit mod 2^32 */
+ /* Implicit mod 2^32 */
*rng = (1103515245*(*rng) + 12345);
return (float)*rng * (1.0f/(float)0xFFFFFFFF);
}
diff --git a/intern/cycles/kernel/kernel_shader.h b/intern/cycles/kernel/kernel_shader.h
index 8c0c5e90a3e..c66f52255f0 100644
--- a/intern/cycles/kernel/kernel_shader.h
+++ b/intern/cycles/kernel/kernel_shader.h
@@ -99,7 +99,7 @@ ccl_device_noinline void shader_setup_from_ray(KernelGlobals *kg,
/* smooth normal */
if(sd->shader & SHADER_SMOOTH_NORMAL)
- sd->N = triangle_smooth_normal(kg, sd->prim, sd->u, sd->v);
+ sd->N = triangle_smooth_normal(kg, Ng, sd->prim, sd->u, sd->v);
#ifdef __DPDU__
/* dPdu/dPdv */
@@ -186,7 +186,7 @@ void shader_setup_from_subsurface(
sd->N = Ng;
if(sd->shader & SHADER_SMOOTH_NORMAL)
- sd->N = triangle_smooth_normal(kg, sd->prim, sd->u, sd->v);
+ sd->N = triangle_smooth_normal(kg, Ng, sd->prim, sd->u, sd->v);
# ifdef __DPDU__
/* dPdu/dPdv */
@@ -300,7 +300,7 @@ ccl_device_inline void shader_setup_from_sample(KernelGlobals *kg,
if(sd->type & PRIMITIVE_TRIANGLE) {
/* smooth normal */
if(sd->shader & SHADER_SMOOTH_NORMAL) {
- sd->N = triangle_smooth_normal(kg, sd->prim, sd->u, sd->v);
+ sd->N = triangle_smooth_normal(kg, Ng, sd->prim, sd->u, sd->v);
#ifdef __INSTANCING__
if(!(sd->object_flag & SD_OBJECT_TRANSFORM_APPLIED)) {
diff --git a/intern/cycles/kernel/kernel_shadow.h b/intern/cycles/kernel/kernel_shadow.h
index db6f839d9ed..fab5946970d 100644
--- a/intern/cycles/kernel/kernel_shadow.h
+++ b/intern/cycles/kernel/kernel_shadow.h
@@ -379,7 +379,7 @@ ccl_device bool shadow_blocked_transparent_stepped(
float3 *shadow)
{
bool blocked, is_transparent_isect;
- if (skip_object == OBJECT_NONE) {
+ if(skip_object == OBJECT_NONE) {
blocked = scene_intersect(kg,
*ray,
PATH_RAY_SHADOW_OPAQUE,
diff --git a/intern/cycles/kernel/kernel_subsurface.h b/intern/cycles/kernel/kernel_subsurface.h
index f75e9337bdb..6475d4b66fd 100644
--- a/intern/cycles/kernel/kernel_subsurface.h
+++ b/intern/cycles/kernel/kernel_subsurface.h
@@ -140,7 +140,7 @@ ccl_device_inline float3 subsurface_scatter_eval(ShaderData *sd,
}
/* replace closures with a single diffuse bsdf closure after scatter step */
-ccl_device void subsurface_scatter_setup_diffuse_bsdf(ShaderData *sd, float3 weight, bool hit, float3 N)
+ccl_device void subsurface_scatter_setup_diffuse_bsdf(ShaderData *sd, ShaderClosure *sc, float3 weight, bool hit, float3 N)
{
sd->flag &= ~SD_CLOSURE_FLAGS;
sd->randb_closure = 0.0f;
@@ -148,15 +148,35 @@ ccl_device void subsurface_scatter_setup_diffuse_bsdf(ShaderData *sd, float3 wei
sd->num_closure_extra = 0;
if(hit) {
- DiffuseBsdf *bsdf = (DiffuseBsdf*)bsdf_alloc(sd, sizeof(DiffuseBsdf), weight);
-
- if(bsdf) {
- bsdf->N = N;
- sd->flag |= bsdf_diffuse_setup(bsdf);
-
- /* replace CLOSURE_BSDF_DIFFUSE_ID with this special ID so render passes
- * can recognize it as not being a regular diffuse closure */
- bsdf->type = CLOSURE_BSDF_BSSRDF_ID;
+ Bssrdf *bssrdf = (Bssrdf *)sc;
+#ifdef __PRINCIPLED__
+ if(bssrdf->type == CLOSURE_BSSRDF_PRINCIPLED_ID) {
+ PrincipledDiffuseBsdf *bsdf = (PrincipledDiffuseBsdf*)bsdf_alloc(sd, sizeof(PrincipledDiffuseBsdf), weight);
+
+ if(bsdf) {
+ bsdf->N = N;
+ bsdf->roughness = bssrdf->roughness;
+ sd->flag |= bsdf_principled_diffuse_setup(bsdf);
+
+ /* replace CLOSURE_BSDF_PRINCIPLED_DIFFUSE_ID with this special ID so render passes
+ * can recognize it as not being a regular Disney principled diffuse closure */
+ bsdf->type = CLOSURE_BSDF_BSSRDF_PRINCIPLED_ID;
+ }
+ }
+ else if(CLOSURE_IS_BSDF_BSSRDF(bssrdf->type) ||
+ CLOSURE_IS_BSSRDF(bssrdf->type))
+#endif /* __PRINCIPLED__ */
+ {
+ DiffuseBsdf *bsdf = (DiffuseBsdf*)bsdf_alloc(sd, sizeof(DiffuseBsdf), weight);
+
+ if(bsdf) {
+ bsdf->N = N;
+ sd->flag |= bsdf_diffuse_setup(bsdf);
+
+ /* replace CLOSURE_BSDF_DIFFUSE_ID with this special ID so render passes
+ * can recognize it as not being a regular diffuse closure */
+ bsdf->type = CLOSURE_BSDF_BSSRDF_ID;
+ }
}
}
}
@@ -379,6 +399,12 @@ ccl_device_noinline void subsurface_scatter_multi_setup(
#else
Ray *ray = &ss_isect->ray;
#endif
+
+ /* Workaround for AMD GPU OpenCL compiler. Most probably cache bypass issue. */
+#if defined(__SPLIT_KERNEL__) && defined(__KERNEL_OPENCL_AMD__) && defined(__KERNEL_GPU__)
+ kernel_split_params.dummy_sd_flag = sd->flag;
+#endif
+
/* Setup new shading point. */
shader_setup_from_subsurface(kg, sd, &ss_isect->hits[hit], ray);
@@ -388,12 +414,11 @@ ccl_device_noinline void subsurface_scatter_multi_setup(
subsurface_color_bump_blur(kg, sd, state, state_flag, &weight, &N);
/* Setup diffuse BSDF. */
- subsurface_scatter_setup_diffuse_bsdf(sd, weight, true, N);
+ subsurface_scatter_setup_diffuse_bsdf(sd, sc, weight, true, N);
}
-#ifndef __SPLIT_KERNEL__
/* subsurface scattering step, from a point on the surface to another nearby point on the same object */
-ccl_device void subsurface_scatter_step(KernelGlobals *kg, ShaderData *sd, PathState *state,
+ccl_device void subsurface_scatter_step(KernelGlobals *kg, ShaderData *sd, ccl_addr_space PathState *state,
int state_flag, ShaderClosure *sc, uint *lcg_state, float disk_u, float disk_v, bool all)
{
float3 eval = make_float3(0.0f, 0.0f, 0.0f);
@@ -454,6 +479,10 @@ ccl_device void subsurface_scatter_step(KernelGlobals *kg, ShaderData *sd, PathS
if(ss_isect.num_hits > 0) {
float3 origP = sd->P;
+ /* Workaround for AMD GPU OpenCL compiler. Most probably cache bypass issue. */
+#if defined(__SPLIT_KERNEL__) && defined(__KERNEL_OPENCL_AMD__) && defined(__KERNEL_GPU__)
+ kernel_split_params.dummy_sd_flag = sd->flag;
+#endif
/* setup new shading point */
shader_setup_from_subsurface(kg, sd, &ss_isect.hits[0], &ray);
@@ -479,9 +508,8 @@ ccl_device void subsurface_scatter_step(KernelGlobals *kg, ShaderData *sd, PathS
subsurface_color_bump_blur(kg, sd, state, state_flag, &eval, &N);
/* setup diffuse bsdf */
- subsurface_scatter_setup_diffuse_bsdf(sd, eval, (ss_isect.num_hits > 0), N);
+ subsurface_scatter_setup_diffuse_bsdf(sd, sc, eval, (ss_isect.num_hits > 0), N);
}
-#endif /* ! __SPLIT_KERNEL__ */
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernel_textures.h b/intern/cycles/kernel/kernel_textures.h
index cb1a3f40dee..aa5b32803a5 100644
--- a/intern/cycles/kernel/kernel_textures.h
+++ b/intern/cycles/kernel/kernel_textures.h
@@ -82,10 +82,10 @@ KERNEL_TEX(uint, texture_uint, __sobol_directions)
# if __CUDA_ARCH__ < 300
/* full-float image */
KERNEL_IMAGE_TEX(float4, texture_image_float4, __tex_image_float4_000)
-KERNEL_IMAGE_TEX(float4, texture_image_float4, __tex_image_float4_001)
-KERNEL_IMAGE_TEX(float4, texture_image_float4, __tex_image_float4_002)
-KERNEL_IMAGE_TEX(float4, texture_image_float4, __tex_image_float4_003)
-KERNEL_IMAGE_TEX(float4, texture_image_float4, __tex_image_float4_004)
+KERNEL_IMAGE_TEX(float4, texture_image_float4, __tex_image_float4_008)
+KERNEL_IMAGE_TEX(float4, texture_image_float4, __tex_image_float4_016)
+KERNEL_IMAGE_TEX(float4, texture_image_float4, __tex_image_float4_024)
+KERNEL_IMAGE_TEX(float4, texture_image_float4, __tex_image_float4_032)
KERNEL_IMAGE_TEX(float4, texture_image3d_float4, __tex_image_float4_3d_000)
KERNEL_IMAGE_TEX(float4, texture_image3d_float4, __tex_image_float4_3d_001)
@@ -93,91 +93,93 @@ KERNEL_IMAGE_TEX(float4, texture_image3d_float4, __tex_image_float4_3d_002)
KERNEL_IMAGE_TEX(float4, texture_image3d_float4, __tex_image_float4_3d_003)
KERNEL_IMAGE_TEX(float4, texture_image3d_float4, __tex_image_float4_3d_004)
-/* image */
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_005)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_006)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_007)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_008)
+/* image
+ * These texture names are encoded to their flattened slots as
+ * ImageManager::type_index_to_flattened_slot() returns them. */
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_001)
KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_009)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_010)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_011)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_012)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_013)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_014)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_015)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_016)
KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_017)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_018)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_019)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_020)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_021)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_022)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_023)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_024)
KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_025)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_026)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_027)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_028)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_029)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_030)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_031)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_032)
KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_033)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_034)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_035)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_036)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_037)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_038)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_039)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_040)
KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_041)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_042)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_043)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_044)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_045)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_046)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_047)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_048)
KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_049)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_050)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_051)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_052)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_053)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_054)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_055)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_056)
KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_057)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_058)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_059)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_060)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_061)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_062)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_063)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_064)
KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_065)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_066)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_067)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_068)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_069)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_070)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_071)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_072)
KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_073)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_074)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_075)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_076)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_077)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_078)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_079)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_080)
KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_081)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_082)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_083)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_084)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_085)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_086)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_087)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_088)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_089)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_097)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_105)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_113)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_121)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_129)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_137)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_145)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_153)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_161)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_169)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_177)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_185)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_193)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_201)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_209)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_217)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_225)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_233)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_241)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_249)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_257)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_265)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_273)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_281)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_289)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_297)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_305)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_313)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_321)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_329)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_337)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_345)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_353)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_361)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_369)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_377)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_385)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_393)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_401)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_409)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_417)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_425)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_433)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_441)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_449)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_457)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_465)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_473)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_481)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_489)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_497)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_505)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_513)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_521)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_529)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_537)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_545)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_553)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_561)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_569)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_577)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_585)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_593)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_601)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_609)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_617)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_625)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_633)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_641)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_649)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_657)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_665)
# else
/* bindless textures */
diff --git a/intern/cycles/kernel/kernel_types.h b/intern/cycles/kernel/kernel_types.h
index 623f3728c69..e6a62c42a38 100644
--- a/intern/cycles/kernel/kernel_types.h
+++ b/intern/cycles/kernel/kernel_types.h
@@ -64,6 +64,18 @@ CCL_NAMESPACE_BEGIN
# define WORK_POOL_SIZE WORK_POOL_SIZE_CPU
#endif
+
+#define SHADER_SORT_BLOCK_SIZE 2048
+
+#ifdef __KERNEL_OPENCL__
+# define SHADER_SORT_LOCAL_SIZE 64
+#elif defined(__KERNEL_CUDA__)
+# define SHADER_SORT_LOCAL_SIZE 32
+#else
+# define SHADER_SORT_LOCAL_SIZE 1
+#endif
+
+
/* device capabilities */
#ifdef __KERNEL_CPU__
# ifdef __KERNEL_SSE2__
@@ -71,21 +83,18 @@ CCL_NAMESPACE_BEGIN
# endif
# define __KERNEL_SHADING__
# define __KERNEL_ADV_SHADING__
-# ifndef __SPLIT_KERNEL__
-# define __BRANCHED_PATH__
-# endif
+# define __BRANCHED_PATH__
# ifdef WITH_OSL
# define __OSL__
# endif
+# define __PRINCIPLED__
# define __SUBSURFACE__
# define __CMJ__
# define __VOLUME__
# define __VOLUME_SCATTER__
# define __SHADOW_RECORD_ALL__
-# ifndef __SPLIT_KERNEL__
-# define __VOLUME_DECOUPLED__
-# define __VOLUME_RECORD_ALL__
-# endif
+# define __VOLUME_DECOUPLED__
+# define __VOLUME_RECORD_ALL__
#endif /* __KERNEL_CPU__ */
#ifdef __KERNEL_CUDA__
@@ -94,10 +103,11 @@ CCL_NAMESPACE_BEGIN
# define __VOLUME__
# define __VOLUME_SCATTER__
# define __SUBSURFACE__
+# define __PRINCIPLED__
# define __SHADOW_RECORD_ALL__
+# define __CMJ__
# ifndef __SPLIT_KERNEL__
# define __BRANCHED_PATH__
-# define __CMJ__
# endif
#endif /* __KERNEL_CUDA__ */
@@ -109,43 +119,44 @@ CCL_NAMESPACE_BEGIN
# define __KERNEL_SHADING__
# define __KERNEL_ADV_SHADING__
# define __SUBSURFACE__
+# define __PRINCIPLED__
# define __VOLUME__
# define __VOLUME_SCATTER__
# define __SHADOW_RECORD_ALL__
-# ifdef __KERNEL_EXPERIMENTAL__
-# define __CMJ__
-# endif
+# define __CMJ__
+# define __BRANCHED_PATH__
# endif /* __KERNEL_OPENCL_NVIDIA__ */
# ifdef __KERNEL_OPENCL_APPLE__
# define __KERNEL_SHADING__
# define __KERNEL_ADV_SHADING__
+# define __PRINCIPLED__
+# define __CMJ__
/* TODO(sergey): Currently experimental section is ignored here,
* this is because megakernel in device_opencl does not support
* custom cflags depending on the scene features.
*/
-# ifdef __KERNEL_EXPERIMENTAL__
-# define __CMJ__
-# endif
-# endif /* __KERNEL_OPENCL_NVIDIA__ */
+# endif /* __KERNEL_OPENCL_APPLE__ */
# ifdef __KERNEL_OPENCL_AMD__
# define __CL_USE_NATIVE__
# define __KERNEL_SHADING__
# define __KERNEL_ADV_SHADING__
# define __SUBSURFACE__
+# define __PRINCIPLED__
# define __VOLUME__
# define __VOLUME_SCATTER__
# define __SHADOW_RECORD_ALL__
+# define __CMJ__
+# define __BRANCHED_PATH__
# endif /* __KERNEL_OPENCL_AMD__ */
# ifdef __KERNEL_OPENCL_INTEL_CPU__
# define __CL_USE_NATIVE__
# define __KERNEL_SHADING__
# define __KERNEL_ADV_SHADING__
-# ifdef __KERNEL_EXPERIMENTAL__
-# define __CMJ__
-# endif
+# define __PRINCIPLED__
+# define __CMJ__
# endif /* __KERNEL_OPENCL_INTEL_CPU__ */
#endif /* __KERNEL_OPENCL__ */
@@ -165,6 +176,8 @@ CCL_NAMESPACE_BEGIN
#define __PATCH_EVAL__
#define __SHADOW_TRICKS__
+#define __DENOISING_FEATURES__
+
#ifdef __KERNEL_SHADING__
# define __SVM__
# define __EMISSION__
@@ -220,7 +233,13 @@ CCL_NAMESPACE_BEGIN
# undef __TRANSPARENT_SHADOWS__
#endif
#ifdef __NO_SHADOW_TRICKS__
-#undef __SHADOW_TRICKS__
+# undef __SHADOW_TRICKS__
+#endif
+#ifdef __NO_PRINCIPLED__
+# undef __PRINCIPLED__
+#endif
+#ifdef __NO_DENOISING__
+# undef __DENOISING_FEATURES__
#endif
/* Random Numbers */
@@ -303,31 +322,32 @@ enum SamplingPattern {
/* these flags values correspond to raytypes in osl.cpp, so keep them in sync! */
enum PathRayFlag {
- PATH_RAY_CAMERA = 1,
- PATH_RAY_REFLECT = 2,
- PATH_RAY_TRANSMIT = 4,
- PATH_RAY_DIFFUSE = 8,
- PATH_RAY_GLOSSY = 16,
- PATH_RAY_SINGULAR = 32,
- PATH_RAY_TRANSPARENT = 64,
-
- PATH_RAY_SHADOW_OPAQUE = 128,
- PATH_RAY_SHADOW_TRANSPARENT = 256,
+ PATH_RAY_CAMERA = (1 << 0),
+ PATH_RAY_REFLECT = (1 << 1),
+ PATH_RAY_TRANSMIT = (1 << 2),
+ PATH_RAY_DIFFUSE = (1 << 3),
+ PATH_RAY_GLOSSY = (1 << 4),
+ PATH_RAY_SINGULAR = (1 << 5),
+ PATH_RAY_TRANSPARENT = (1 << 6),
+
+ PATH_RAY_SHADOW_OPAQUE = (1 << 7),
+ PATH_RAY_SHADOW_TRANSPARENT = (1 << 8),
PATH_RAY_SHADOW = (PATH_RAY_SHADOW_OPAQUE|PATH_RAY_SHADOW_TRANSPARENT),
- PATH_RAY_CURVE = 512, /* visibility flag to define curve segments */
- PATH_RAY_VOLUME_SCATTER = 1024, /* volume scattering */
+ PATH_RAY_CURVE = (1 << 9), /* visibility flag to define curve segments */
+ PATH_RAY_VOLUME_SCATTER = (1 << 10), /* volume scattering */
/* Special flag to tag unaligned BVH nodes. */
- PATH_RAY_NODE_UNALIGNED = 2048,
+ PATH_RAY_NODE_UNALIGNED = (1 << 11),
- PATH_RAY_ALL_VISIBILITY = (1|2|4|8|16|32|64|128|256|512|1024|2048),
+ PATH_RAY_ALL_VISIBILITY = ((1 << 12)-1),
- PATH_RAY_MIS_SKIP = 4096,
- PATH_RAY_DIFFUSE_ANCESTOR = 8192,
- PATH_RAY_SINGLE_PASS_DONE = 16384,
- PATH_RAY_SHADOW_CATCHER = 32768,
- PATH_RAY_SHADOW_CATCHER_ONLY = 65536,
+ PATH_RAY_MIS_SKIP = (1 << 12),
+ PATH_RAY_DIFFUSE_ANCESTOR = (1 << 13),
+ PATH_RAY_SINGLE_PASS_DONE = (1 << 14),
+ PATH_RAY_SHADOW_CATCHER = (1 << 15),
+ PATH_RAY_SHADOW_CATCHER_ONLY = (1 << 16),
+ PATH_RAY_STORE_SHADOW_INFO = (1 << 17),
};
/* Closure Label */
@@ -383,6 +403,22 @@ typedef enum PassType {
#define PASS_ALL (~0)
+typedef enum DenoisingPassOffsets {
+ DENOISING_PASS_NORMAL = 0,
+ DENOISING_PASS_NORMAL_VAR = 3,
+ DENOISING_PASS_ALBEDO = 6,
+ DENOISING_PASS_ALBEDO_VAR = 9,
+ DENOISING_PASS_DEPTH = 12,
+ DENOISING_PASS_DEPTH_VAR = 13,
+ DENOISING_PASS_SHADOW_A = 14,
+ DENOISING_PASS_SHADOW_B = 17,
+ DENOISING_PASS_COLOR = 20,
+ DENOISING_PASS_COLOR_VAR = 23,
+
+ DENOISING_PASS_SIZE_BASE = 26,
+ DENOISING_PASS_SIZE_CLEAN = 3,
+} DenoisingPassOffsets;
+
typedef enum BakePassFilter {
BAKE_FILTER_NONE = 0,
BAKE_FILTER_DIRECT = (1 << 0),
@@ -416,6 +452,18 @@ typedef enum BakePassFilterCombos {
BAKE_FILTER_SUBSURFACE_INDIRECT = (BAKE_FILTER_INDIRECT | BAKE_FILTER_SUBSURFACE),
} BakePassFilterCombos;
+typedef enum DenoiseFlag {
+ DENOISING_CLEAN_DIFFUSE_DIR = (1 << 0),
+ DENOISING_CLEAN_DIFFUSE_IND = (1 << 1),
+ DENOISING_CLEAN_GLOSSY_DIR = (1 << 2),
+ DENOISING_CLEAN_GLOSSY_IND = (1 << 3),
+ DENOISING_CLEAN_TRANSMISSION_DIR = (1 << 4),
+ DENOISING_CLEAN_TRANSMISSION_IND = (1 << 5),
+ DENOISING_CLEAN_SUBSURFACE_DIR = (1 << 6),
+ DENOISING_CLEAN_SUBSURFACE_IND = (1 << 7),
+ DENOISING_CLEAN_ALL_PASSES = (1 << 8)-1,
+} DenoiseFlag;
+
typedef ccl_addr_space struct PathRadiance {
#ifdef __PASSES__
int use_light_pass;
@@ -469,8 +517,20 @@ typedef ccl_addr_space struct PathRadiance {
float3 path_total_shaded;
/* Color of the background on which shadow is alpha-overed. */
- float3 shadow_color;
+ float3 shadow_background_color;
+
+ /* Path radiance sum and throughput at the moment when ray hits shadow
+ * catcher object.
+ */
+ float3 shadow_radiance_sum;
+ float shadow_throughput;
#endif
+
+#ifdef __DENOISING_FEATURES__
+ float3 denoising_normal;
+ float3 denoising_albedo;
+ float denoising_depth;
+#endif /* __DENOISING_FEATURES__ */
} PathRadiance;
typedef struct BsdfEval {
@@ -713,12 +773,13 @@ typedef struct AttributeDescriptor {
#define SHADER_CLOSURE_BASE \
float3 weight; \
ClosureType type; \
- float sample_weight \
+ float sample_weight; \
+ float3 N
typedef ccl_addr_space struct ccl_align(16) ShaderClosure {
SHADER_CLOSURE_BASE;
- float data[14]; /* pad to 80 bytes */
+ float data[10]; /* pad to 80 bytes */
} ShaderClosure;
/* Shader Context
@@ -949,6 +1010,10 @@ typedef struct PathState {
int transmission_bounce;
int transparent_bounce;
+#ifdef __DENOISING_FEATURES__
+ float denoising_feature_weight;
+#endif /* __DENOISING_FEATURES__ */
+
/* multiple importance sampling */
float min_ray_pdf; /* smallest bounce pdf over entire path up to now */
float ray_pdf; /* last bounce pdf */
@@ -1126,6 +1191,11 @@ typedef struct KernelFilm {
float mist_inv_depth;
float mist_falloff;
+ int pass_denoising_data;
+ int pass_denoising_clean;
+ int denoising_flags;
+ int pad;
+
#ifdef __KERNEL_DEBUG__
int pass_bvh_traversed_nodes;
int pass_bvh_traversed_instances;
@@ -1298,7 +1368,6 @@ typedef ccl_addr_space struct DebugData {
* Queue 3 - Shadow ray cast kernel - AO
* Queeu 4 - Shadow ray cast kernel - direct lighting
*/
-#define NUM_QUEUES 4
/* Queue names */
enum QueueNumber {
@@ -1311,22 +1380,42 @@ enum QueueNumber {
* 3. Rays to be regenerated
* are enqueued here.
*/
- QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS = 1,
+ QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
/* All rays for which a shadow ray should be cast to determine radiance
* contribution for AO are enqueued here.
*/
- QUEUE_SHADOW_RAY_CAST_AO_RAYS = 2,
+ QUEUE_SHADOW_RAY_CAST_AO_RAYS,
/* All rays for which a shadow ray should be cast to determine radiance
* contributing for direct lighting are enqueued here.
*/
- QUEUE_SHADOW_RAY_CAST_DL_RAYS = 3,
+ QUEUE_SHADOW_RAY_CAST_DL_RAYS,
+
+ /* Rays sorted according to shader->id */
+ QUEUE_SHADER_SORTED_RAYS,
+
+#ifdef __BRANCHED_PATH__
+ /* All rays moving to next iteration of the indirect loop for light */
+ QUEUE_LIGHT_INDIRECT_ITER,
+ /* Queue of all inactive rays. These are candidates for sharing work of indirect loops */
+ QUEUE_INACTIVE_RAYS,
+# ifdef __VOLUME__
+ /* All rays moving to next iteration of the indirect loop for volumes */
+ QUEUE_VOLUME_INDIRECT_ITER,
+# endif
+# ifdef __SUBSURFACE__
+ /* All rays moving to next iteration of the indirect loop for subsurface */
+ QUEUE_SUBSURFACE_INDIRECT_ITER,
+# endif
+#endif /* __BRANCHED_PATH__ */
+
+ NUM_QUEUES
};
-/* We use RAY_STATE_MASK to get ray_state (enums 0 to 5) */
-#define RAY_STATE_MASK 0x007
-#define RAY_FLAG_MASK 0x0F8
+/* We use RAY_STATE_MASK to get ray_state */
+#define RAY_STATE_MASK 0x0F
+#define RAY_FLAG_MASK 0xF0
enum RayState {
RAY_INVALID = 0,
/* Denotes ray is actively involved in path-iteration. */
@@ -1341,14 +1430,25 @@ enum RayState {
RAY_TO_REGENERATE,
/* Denotes ray has been regenerated */
RAY_REGENERATED,
- /* Flag's ray has to execute shadow blocked function in AO part */
- RAY_SHADOW_RAY_CAST_AO = 16,
- /* Flag's ray has to execute shadow blocked function in direct lighting part. */
- RAY_SHADOW_RAY_CAST_DL = 32,
+ /* Denotes ray is moving to next iteration of the branched indirect loop */
+ RAY_LIGHT_INDIRECT_NEXT_ITER,
+ RAY_VOLUME_INDIRECT_NEXT_ITER,
+ RAY_SUBSURFACE_INDIRECT_NEXT_ITER,
+
+ /* Ray flags */
+
+ /* Flags to denote that the ray is currently evaluating the branched indirect loop */
+ RAY_BRANCHED_LIGHT_INDIRECT = (1 << 4),
+ RAY_BRANCHED_VOLUME_INDIRECT = (1 << 5),
+ RAY_BRANCHED_SUBSURFACE_INDIRECT = (1 << 6),
+ RAY_BRANCHED_INDIRECT = (RAY_BRANCHED_LIGHT_INDIRECT | RAY_BRANCHED_VOLUME_INDIRECT | RAY_BRANCHED_SUBSURFACE_INDIRECT),
+
+ /* Ray is evaluating an iteration of an indirect loop for another thread */
+ RAY_BRANCHED_INDIRECT_SHARED = (1 << 7),
};
#define ASSIGN_RAY_STATE(ray_state, ray_index, state) (ray_state[ray_index] = ((ray_state[ray_index] & RAY_FLAG_MASK) | state))
-#define IS_STATE(ray_state, ray_index, state) ((ray_state[ray_index] & RAY_STATE_MASK) == state)
+#define IS_STATE(ray_state, ray_index, state) ((ray_index) != QUEUE_EMPTY_SLOT && ((ray_state)[(ray_index)] & RAY_STATE_MASK) == (state))
#define ADD_RAY_FLAG(ray_state, ray_index, flag) (ray_state[ray_index] = (ray_state[ray_index] | flag))
#define REMOVE_RAY_FLAG(ray_state, ray_index, flag) (ray_state[ray_index] = (ray_state[ray_index] & (~flag)))
#define IS_FLAG(ray_state, ray_index, flag) (ray_state[ray_index] & flag)
diff --git a/intern/cycles/kernel/kernel_volume.h b/intern/cycles/kernel/kernel_volume.h
index 9c0878249d4..1e472aaf51a 100644
--- a/intern/cycles/kernel/kernel_volume.h
+++ b/intern/cycles/kernel/kernel_volume.h
@@ -660,6 +660,7 @@ typedef struct VolumeSegment {
* but the entire segment is needed to do always scattering, rather than probabilistically
* hitting or missing the volume. if we don't know the transmittance at the end of the
* volume we can't generate stratified distance samples up to that transmittance */
+#ifdef __VOLUME_DECOUPLED__
ccl_device void kernel_volume_decoupled_record(KernelGlobals *kg, PathState *state,
Ray *ray, ShaderData *sd, VolumeSegment *segment, bool heterogeneous)
{
@@ -829,6 +830,7 @@ ccl_device void kernel_volume_decoupled_free(KernelGlobals *kg, VolumeSegment *s
#endif
}
}
+#endif /* __VOLUME_DECOUPLED__ */
/* scattering for homogeneous and heterogeneous volumes, using decoupled ray
* marching.
diff --git a/intern/cycles/kernel/kernels/cpu/filter.cpp b/intern/cycles/kernel/kernels/cpu/filter.cpp
new file mode 100644
index 00000000000..2ff1a392dc3
--- /dev/null
+++ b/intern/cycles/kernel/kernels/cpu/filter.cpp
@@ -0,0 +1,61 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* CPU kernel entry points */
+
+/* On x86-64, we can assume SSE2, so avoid the extra kernel and compile this
+ * one with SSE2 intrinsics.
+ */
+#if defined(__x86_64__) || defined(_M_X64)
+# define __KERNEL_SSE2__
+#endif
+
+/* When building kernel for native machine detect kernel features from the flags
+ * set by compiler.
+ */
+#ifdef WITH_KERNEL_NATIVE
+# ifdef __SSE2__
+# ifndef __KERNEL_SSE2__
+# define __KERNEL_SSE2__
+# endif
+# endif
+# ifdef __SSE3__
+# define __KERNEL_SSE3__
+# endif
+# ifdef __SSSE3__
+# define __KERNEL_SSSE3__
+# endif
+# ifdef __SSE4_1__
+# define __KERNEL_SSE41__
+# endif
+# ifdef __AVX__
+# define __KERNEL_SSE__
+# define __KERNEL_AVX__
+# endif
+# ifdef __AVX2__
+# define __KERNEL_SSE__
+# define __KERNEL_AVX2__
+# endif
+#endif
+
+/* quiet unused define warnings */
+#if defined(__KERNEL_SSE2__)
+ /* do nothing */
+#endif
+
+#include "kernel/filter/filter.h"
+#define KERNEL_ARCH cpu
+#include "kernel/kernels/cpu/filter_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/filter_avx.cpp b/intern/cycles/kernel/kernels/cpu/filter_avx.cpp
new file mode 100644
index 00000000000..4a9e6047ecf
--- /dev/null
+++ b/intern/cycles/kernel/kernels/cpu/filter_avx.cpp
@@ -0,0 +1,39 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Optimized CPU kernel entry points. This file is compiled with AVX
+ * optimization flags and nearly all functions inlined, while kernel.cpp
+ * is compiled without for other CPU's. */
+
+#include "util/util_optimization.h"
+
+#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
+# define KERNEL_STUB
+#else
+/* SSE optimization disabled for now on 32 bit, see bug #36316 */
+# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+# define __KERNEL_SSE__
+# define __KERNEL_SSE2__
+# define __KERNEL_SSE3__
+# define __KERNEL_SSSE3__
+# define __KERNEL_SSE41__
+# define __KERNEL_AVX__
+# endif
+#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX */
+
+#include "kernel/filter/filter.h"
+#define KERNEL_ARCH cpu_avx
+#include "kernel/kernels/cpu/filter_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/filter_avx2.cpp b/intern/cycles/kernel/kernels/cpu/filter_avx2.cpp
new file mode 100644
index 00000000000..c22ec576254
--- /dev/null
+++ b/intern/cycles/kernel/kernels/cpu/filter_avx2.cpp
@@ -0,0 +1,40 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Optimized CPU kernel entry points. This file is compiled with AVX2
+ * optimization flags and nearly all functions inlined, while kernel.cpp
+ * is compiled without for other CPU's. */
+
+#include "util/util_optimization.h"
+
+#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
+# define KERNEL_STUB
+#else
+/* SSE optimization disabled for now on 32 bit, see bug #36316 */
+# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+# define __KERNEL_SSE__
+# define __KERNEL_SSE2__
+# define __KERNEL_SSE3__
+# define __KERNEL_SSSE3__
+# define __KERNEL_SSE41__
+# define __KERNEL_AVX__
+# define __KERNEL_AVX2__
+# endif
+#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 */
+
+#include "kernel/filter/filter.h"
+#define KERNEL_ARCH cpu_avx2
+#include "kernel/kernels/cpu/filter_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/filter_cpu.h b/intern/cycles/kernel/kernels/cpu/filter_cpu.h
new file mode 100644
index 00000000000..2ed713299fd
--- /dev/null
+++ b/intern/cycles/kernel/kernels/cpu/filter_cpu.h
@@ -0,0 +1,138 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Templated common declaration part of all CPU kernels. */
+
+void KERNEL_FUNCTION_FULL_NAME(filter_divide_shadow)(int sample,
+ TilesInfo *tiles,
+ int x,
+ int y,
+ float *unfilteredA,
+ float *unfilteredB,
+ float *sampleV,
+ float *sampleVV,
+ float *bufferV,
+ int* prefilter_rect,
+ int buffer_pass_stride,
+ int buffer_denoising_offset,
+ bool use_split_variance);
+
+void KERNEL_FUNCTION_FULL_NAME(filter_get_feature)(int sample,
+ TilesInfo *tiles,
+ int m_offset,
+ int v_offset,
+ int x,
+ int y,
+ float *mean,
+ float *variance,
+ int* prefilter_rect,
+ int buffer_pass_stride,
+ int buffer_denoising_offset,
+ bool use_split_variance);
+
+void KERNEL_FUNCTION_FULL_NAME(filter_detect_outliers)(int x, int y,
+ ccl_global float *image,
+ ccl_global float *variance,
+ ccl_global float *depth,
+ ccl_global float *output,
+ int *rect,
+ int pass_stride);
+
+void KERNEL_FUNCTION_FULL_NAME(filter_combine_halves)(int x, int y,
+ float *mean,
+ float *variance,
+ float *a,
+ float *b,
+ int* prefilter_rect,
+ int r);
+
+void KERNEL_FUNCTION_FULL_NAME(filter_construct_transform)(float* buffer,
+ int x,
+ int y,
+ int storage_ofs,
+ float *transform,
+ int *rank,
+ int* rect,
+ int pass_stride,
+ int radius,
+ float pca_threshold);
+
+void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_difference)(int dx,
+ int dy,
+ float *weight_image,
+ float *variance,
+ float *difference_image,
+ int* rect,
+ int w,
+ int channel_offset,
+ float a,
+ float k_2);
+
+void KERNEL_FUNCTION_FULL_NAME(filter_nlm_blur)(float *difference_image,
+ float *out_image,
+ int* rect,
+ int w,
+ int f);
+
+void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_weight)(float *difference_image,
+ float *out_image,
+ int* rect,
+ int w,
+ int f);
+
+void KERNEL_FUNCTION_FULL_NAME(filter_nlm_update_output)(int dx,
+ int dy,
+ float *difference_image,
+ float *image,
+ float *out_image,
+ float *accum_image,
+ int* rect,
+ int w,
+ int f);
+
+void KERNEL_FUNCTION_FULL_NAME(filter_nlm_construct_gramian)(int dx,
+ int dy,
+ float *difference_image,
+ float *buffer,
+ float *transform,
+ int *rank,
+ float *XtWX,
+ float3 *XtWY,
+ int *rect,
+ int *filter_rect,
+ int w,
+ int h,
+ int f,
+ int pass_stride);
+
+void KERNEL_FUNCTION_FULL_NAME(filter_nlm_normalize)(float *out_image,
+ float *accum_image,
+ int* rect,
+ int w);
+
+void KERNEL_FUNCTION_FULL_NAME(filter_finalize)(int x,
+ int y,
+ int storage_ofs,
+ int w,
+ int h,
+ float *buffer,
+ int *rank,
+ float *XtWX,
+ float3 *XtWY,
+ int *buffer_params,
+ int sample);
+
+#undef KERNEL_ARCH
diff --git a/intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h b/intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h
new file mode 100644
index 00000000000..8dc1a8d583c
--- /dev/null
+++ b/intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h
@@ -0,0 +1,272 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Templated common implementation part of all CPU kernels.
+ *
+ * The idea is that particular .cpp files sets needed optimization flags and
+ * simply includes this file without worry of copying actual implementation over.
+ */
+
+#include "kernel/kernel_compat_cpu.h"
+
+#include "kernel/filter/filter_kernel.h"
+
+#ifdef KERNEL_STUB
+# include "util/util_debug.h"
+# define STUB_ASSERT(arch, name) assert(!(#name " kernel stub for architecture " #arch " was called!"))
+#endif
+
+CCL_NAMESPACE_BEGIN
+
+
+/* Denoise filter */
+
+void KERNEL_FUNCTION_FULL_NAME(filter_divide_shadow)(int sample,
+ TilesInfo *tiles,
+ int x,
+ int y,
+ float *unfilteredA,
+ float *unfilteredB,
+ float *sampleVariance,
+ float *sampleVarianceV,
+ float *bufferVariance,
+ int* prefilter_rect,
+ int buffer_pass_stride,
+ int buffer_denoising_offset,
+ bool use_split_variance)
+{
+#ifdef KERNEL_STUB
+ STUB_ASSERT(KERNEL_ARCH, filter_divide_shadow);
+#else
+ kernel_filter_divide_shadow(sample, tiles,
+ x, y,
+ unfilteredA,
+ unfilteredB,
+ sampleVariance,
+ sampleVarianceV,
+ bufferVariance,
+ load_int4(prefilter_rect),
+ buffer_pass_stride,
+ buffer_denoising_offset,
+ use_split_variance);
+#endif
+}
+
+void KERNEL_FUNCTION_FULL_NAME(filter_get_feature)(int sample,
+ TilesInfo *tiles,
+ int m_offset,
+ int v_offset,
+ int x,
+ int y,
+ float *mean, float *variance,
+ int* prefilter_rect,
+ int buffer_pass_stride,
+ int buffer_denoising_offset,
+ bool use_split_variance)
+{
+#ifdef KERNEL_STUB
+ STUB_ASSERT(KERNEL_ARCH, filter_get_feature);
+#else
+ kernel_filter_get_feature(sample, tiles,
+ m_offset, v_offset,
+ x, y,
+ mean, variance,
+ load_int4(prefilter_rect),
+ buffer_pass_stride,
+ buffer_denoising_offset,
+ use_split_variance);
+#endif
+}
+
+void KERNEL_FUNCTION_FULL_NAME(filter_detect_outliers)(int x, int y,
+ ccl_global float *image,
+ ccl_global float *variance,
+ ccl_global float *depth,
+ ccl_global float *output,
+ int *rect,
+ int pass_stride)
+{
+#ifdef KERNEL_STUB
+ STUB_ASSERT(KERNEL_ARCH, filter_detect_outliers);
+#else
+ kernel_filter_detect_outliers(x, y, image, variance, depth, output, load_int4(rect), pass_stride);
+#endif
+}
+
+void KERNEL_FUNCTION_FULL_NAME(filter_combine_halves)(int x, int y,
+ float *mean,
+ float *variance,
+ float *a,
+ float *b,
+ int* prefilter_rect,
+ int r)
+{
+#ifdef KERNEL_STUB
+ STUB_ASSERT(KERNEL_ARCH, filter_combine_halves);
+#else
+ kernel_filter_combine_halves(x, y, mean, variance, a, b, load_int4(prefilter_rect), r);
+#endif
+}
+
+void KERNEL_FUNCTION_FULL_NAME(filter_construct_transform)(float* buffer,
+ int x,
+ int y,
+ int storage_ofs,
+ float *transform,
+ int *rank,
+ int* prefilter_rect,
+ int pass_stride,
+ int radius,
+ float pca_threshold)
+{
+#ifdef KERNEL_STUB
+ STUB_ASSERT(KERNEL_ARCH, filter_construct_transform);
+#else
+ rank += storage_ofs;
+ transform += storage_ofs*TRANSFORM_SIZE;
+ kernel_filter_construct_transform(buffer,
+ x, y,
+ load_int4(prefilter_rect),
+ pass_stride,
+ transform,
+ rank,
+ radius,
+ pca_threshold);
+#endif
+}
+
+void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_difference)(int dx,
+ int dy,
+ float *weight_image,
+ float *variance,
+ float *difference_image,
+ int *rect,
+ int w,
+ int channel_offset,
+ float a,
+ float k_2)
+{
+#ifdef KERNEL_STUB
+ STUB_ASSERT(KERNEL_ARCH, filter_nlm_calc_difference);
+#else
+ kernel_filter_nlm_calc_difference(dx, dy, weight_image, variance, difference_image, load_int4(rect), w, channel_offset, a, k_2);
+#endif
+}
+
+void KERNEL_FUNCTION_FULL_NAME(filter_nlm_blur)(float *difference_image,
+ float *out_image,
+ int *rect,
+ int w,
+ int f)
+{
+#ifdef KERNEL_STUB
+ STUB_ASSERT(KERNEL_ARCH, filter_nlm_blur);
+#else
+ kernel_filter_nlm_blur(difference_image, out_image, load_int4(rect), w, f);
+#endif
+}
+
+void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_weight)(float *difference_image,
+ float *out_image,
+ int *rect,
+ int w,
+ int f)
+{
+#ifdef KERNEL_STUB
+ STUB_ASSERT(KERNEL_ARCH, filter_nlm_calc_weight);
+#else
+ kernel_filter_nlm_calc_weight(difference_image, out_image, load_int4(rect), w, f);
+#endif
+}
+
+void KERNEL_FUNCTION_FULL_NAME(filter_nlm_update_output)(int dx,
+ int dy,
+ float *difference_image,
+ float *image,
+ float *out_image,
+ float *accum_image,
+ int *rect,
+ int w,
+ int f)
+{
+#ifdef KERNEL_STUB
+ STUB_ASSERT(KERNEL_ARCH, filter_nlm_update_output);
+#else
+ kernel_filter_nlm_update_output(dx, dy, difference_image, image, out_image, accum_image, load_int4(rect), w, f);
+#endif
+}
+
+void KERNEL_FUNCTION_FULL_NAME(filter_nlm_construct_gramian)(int dx,
+ int dy,
+ float *difference_image,
+ float *buffer,
+ float *transform,
+ int *rank,
+ float *XtWX,
+ float3 *XtWY,
+ int *rect,
+ int *filter_rect,
+ int w,
+ int h,
+ int f,
+ int pass_stride)
+{
+#ifdef KERNEL_STUB
+ STUB_ASSERT(KERNEL_ARCH, filter_nlm_construct_gramian);
+#else
+ kernel_filter_nlm_construct_gramian(dx, dy, difference_image, buffer, transform, rank, XtWX, XtWY, load_int4(rect), load_int4(filter_rect), w, h, f, pass_stride);
+#endif
+}
+
+void KERNEL_FUNCTION_FULL_NAME(filter_nlm_normalize)(float *out_image,
+ float *accum_image,
+ int *rect,
+ int w)
+{
+#ifdef KERNEL_STUB
+ STUB_ASSERT(KERNEL_ARCH, filter_nlm_normalize);
+#else
+ kernel_filter_nlm_normalize(out_image, accum_image, load_int4(rect), w);
+#endif
+}
+
+void KERNEL_FUNCTION_FULL_NAME(filter_finalize)(int x,
+ int y,
+ int storage_ofs,
+ int w,
+ int h,
+ float *buffer,
+ int *rank,
+ float *XtWX,
+ float3 *XtWY,
+ int *buffer_params,
+ int sample)
+{
+#ifdef KERNEL_STUB
+ STUB_ASSERT(KERNEL_ARCH, filter_finalize);
+#else
+ XtWX += storage_ofs*XTWX_SIZE;
+ XtWY += storage_ofs*XTWY_SIZE;
+ rank += storage_ofs;
+ kernel_filter_finalize(x, y, w, h, buffer, rank, 1, XtWX, XtWY, load_int4(buffer_params), sample);
+#endif
+}
+
+#undef KERNEL_STUB
+#undef STUB_ASSERT
+#undef KERNEL_ARCH
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernels/cpu/filter_sse2.cpp b/intern/cycles/kernel/kernels/cpu/filter_sse2.cpp
new file mode 100644
index 00000000000..f7c9935f1d0
--- /dev/null
+++ b/intern/cycles/kernel/kernels/cpu/filter_sse2.cpp
@@ -0,0 +1,34 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Optimized CPU kernel entry points. This file is compiled with SSE2
+ * optimization flags and nearly all functions inlined, while kernel.cpp
+ * is compiled without for other CPU's. */
+
+#include "util/util_optimization.h"
+
+#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
+# define KERNEL_STUB
+#else
+/* SSE optimization disabled for now on 32 bit, see bug #36316 */
+# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+# define __KERNEL_SSE2__
+# endif
+#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 */
+
+#include "kernel/filter/filter.h"
+#define KERNEL_ARCH cpu_sse2
+#include "kernel/kernels/cpu/filter_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/filter_sse3.cpp b/intern/cycles/kernel/kernels/cpu/filter_sse3.cpp
new file mode 100644
index 00000000000..070b95a3505
--- /dev/null
+++ b/intern/cycles/kernel/kernels/cpu/filter_sse3.cpp
@@ -0,0 +1,36 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Optimized CPU kernel entry points. This file is compiled with SSE3/SSSE3
+ * optimization flags and nearly all functions inlined, while kernel.cpp
+ * is compiled without for other CPU's. */
+
+#include "util/util_optimization.h"
+
+#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
+# define KERNEL_STUB
+#else
+/* SSE optimization disabled for now on 32 bit, see bug #36316 */
+# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+# define __KERNEL_SSE2__
+# define __KERNEL_SSE3__
+# define __KERNEL_SSSE3__
+# endif
+#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 */
+
+#include "kernel/filter/filter.h"
+#define KERNEL_ARCH cpu_sse3
+#include "kernel/kernels/cpu/filter_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/filter_sse41.cpp b/intern/cycles/kernel/kernels/cpu/filter_sse41.cpp
new file mode 100644
index 00000000000..1a7b2040da1
--- /dev/null
+++ b/intern/cycles/kernel/kernels/cpu/filter_sse41.cpp
@@ -0,0 +1,37 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Optimized CPU kernel entry points. This file is compiled with SSE3/SSSE3
+ * optimization flags and nearly all functions inlined, while kernel.cpp
+ * is compiled without for other CPU's. */
+
+#include "util/util_optimization.h"
+
+#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
+# define KERNEL_STUB
+#else
+/* SSE optimization disabled for now on 32 bit, see bug #36316 */
+# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+# define __KERNEL_SSE2__
+# define __KERNEL_SSE3__
+# define __KERNEL_SSSE3__
+# define __KERNEL_SSE41__
+# endif
+#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 */
+
+#include "kernel/filter/filter.h"
+#define KERNEL_ARCH cpu_sse41
+#include "kernel/kernels/cpu/filter_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/kernel.cpp b/intern/cycles/kernel/kernels/cpu/kernel.cpp
index 16992c681e6..998619ac897 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel.cpp
+++ b/intern/cycles/kernel/kernels/cpu/kernel.cpp
@@ -95,9 +95,12 @@ void kernel_tex_copy(KernelGlobals *kg,
else if(strstr(name, "__tex_image_float4")) {
texture_image_float4 *tex = NULL;
int id = atoi(name + strlen("__tex_image_float4_"));
- int array_index = id;
+ int array_index = kernel_tex_index(id);
- if(array_index >= 0 && array_index < TEX_NUM_FLOAT4_CPU) {
+ if(array_index >= 0) {
+ if(array_index >= kg->texture_float4_images.size()) {
+ kg->texture_float4_images.resize(array_index+1);
+ }
tex = &kg->texture_float4_images[array_index];
}
@@ -111,9 +114,12 @@ void kernel_tex_copy(KernelGlobals *kg,
else if(strstr(name, "__tex_image_float")) {
texture_image_float *tex = NULL;
int id = atoi(name + strlen("__tex_image_float_"));
- int array_index = id - TEX_START_FLOAT_CPU;
+ int array_index = kernel_tex_index(id);
- if(array_index >= 0 && array_index < TEX_NUM_FLOAT_CPU) {
+ if(array_index >= 0) {
+ if(array_index >= kg->texture_float_images.size()) {
+ kg->texture_float_images.resize(array_index+1);
+ }
tex = &kg->texture_float_images[array_index];
}
@@ -127,9 +133,12 @@ void kernel_tex_copy(KernelGlobals *kg,
else if(strstr(name, "__tex_image_byte4")) {
texture_image_uchar4 *tex = NULL;
int id = atoi(name + strlen("__tex_image_byte4_"));
- int array_index = id - TEX_START_BYTE4_CPU;
+ int array_index = kernel_tex_index(id);
- if(array_index >= 0 && array_index < TEX_NUM_BYTE4_CPU) {
+ if(array_index >= 0) {
+ if(array_index >= kg->texture_byte4_images.size()) {
+ kg->texture_byte4_images.resize(array_index+1);
+ }
tex = &kg->texture_byte4_images[array_index];
}
@@ -143,9 +152,12 @@ void kernel_tex_copy(KernelGlobals *kg,
else if(strstr(name, "__tex_image_byte")) {
texture_image_uchar *tex = NULL;
int id = atoi(name + strlen("__tex_image_byte_"));
- int array_index = id - TEX_START_BYTE_CPU;
+ int array_index = kernel_tex_index(id);
- if(array_index >= 0 && array_index < TEX_NUM_BYTE_CPU) {
+ if(array_index >= 0) {
+ if(array_index >= kg->texture_byte_images.size()) {
+ kg->texture_byte_images.resize(array_index+1);
+ }
tex = &kg->texture_byte_images[array_index];
}
@@ -159,9 +171,12 @@ void kernel_tex_copy(KernelGlobals *kg,
else if(strstr(name, "__tex_image_half4")) {
texture_image_half4 *tex = NULL;
int id = atoi(name + strlen("__tex_image_half4_"));
- int array_index = id - TEX_START_HALF4_CPU;
+ int array_index = kernel_tex_index(id);
- if(array_index >= 0 && array_index < TEX_NUM_HALF4_CPU) {
+ if(array_index >= 0) {
+ if(array_index >= kg->texture_half4_images.size()) {
+ kg->texture_half4_images.resize(array_index+1);
+ }
tex = &kg->texture_half4_images[array_index];
}
@@ -175,9 +190,12 @@ void kernel_tex_copy(KernelGlobals *kg,
else if(strstr(name, "__tex_image_half")) {
texture_image_half *tex = NULL;
int id = atoi(name + strlen("__tex_image_half_"));
- int array_index = id - TEX_START_HALF_CPU;
+ int array_index = kernel_tex_index(id);
- if(array_index >= 0 && array_index < TEX_NUM_HALF_CPU) {
+ if(array_index >= 0) {
+ if(array_index >= kg->texture_half_images.size()) {
+ kg->texture_half_images.resize(array_index+1);
+ }
tex = &kg->texture_half_images[array_index];
}
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_avx.cpp b/intern/cycles/kernel/kernels/cpu/kernel_avx.cpp
index 2600d977972..a645fb4d8dd 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_avx.cpp
+++ b/intern/cycles/kernel/kernels/cpu/kernel_avx.cpp
@@ -17,21 +17,23 @@
/* Optimized CPU kernel entry points. This file is compiled with AVX
* optimization flags and nearly all functions inlined, while kernel.cpp
* is compiled without for other CPU's. */
-
-/* SSE optimization disabled for now on 32 bit, see bug #36316 */
-#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
-# define __KERNEL_SSE__
-# define __KERNEL_SSE2__
-# define __KERNEL_SSE3__
-# define __KERNEL_SSSE3__
-# define __KERNEL_SSE41__
-# define __KERNEL_AVX__
-#endif
#include "util/util_optimization.h"
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
-# include "kernel/kernel.h"
-# define KERNEL_ARCH cpu_avx
-# include "kernel/kernels/cpu/kernel_cpu_impl.h"
+#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
+# define KERNEL_STUB
+#else
+/* SSE optimization disabled for now on 32 bit, see bug #36316 */
+# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+# define __KERNEL_SSE__
+# define __KERNEL_SSE2__
+# define __KERNEL_SSE3__
+# define __KERNEL_SSSE3__
+# define __KERNEL_SSE41__
+# define __KERNEL_AVX__
+# endif
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX */
+
+#include "kernel/kernel.h"
+#define KERNEL_ARCH cpu_avx
+#include "kernel/kernels/cpu/kernel_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_avx2.cpp b/intern/cycles/kernel/kernels/cpu/kernel_avx2.cpp
index dba15d037ac..6bbb87727b9 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_avx2.cpp
+++ b/intern/cycles/kernel/kernels/cpu/kernel_avx2.cpp
@@ -18,21 +18,23 @@
* optimization flags and nearly all functions inlined, while kernel.cpp
* is compiled without for other CPU's. */
-/* SSE optimization disabled for now on 32 bit, see bug #36316 */
-#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
-# define __KERNEL_SSE__
-# define __KERNEL_SSE2__
-# define __KERNEL_SSE3__
-# define __KERNEL_SSSE3__
-# define __KERNEL_SSE41__
-# define __KERNEL_AVX__
-# define __KERNEL_AVX2__
-#endif
-
#include "util/util_optimization.h"
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
-# include "kernel/kernel.h"
-# define KERNEL_ARCH cpu_avx2
-# include "kernel/kernels/cpu/kernel_cpu_impl.h"
+#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
+# define KERNEL_STUB
+#else
+/* SSE optimization disabled for now on 32 bit, see bug #36316 */
+# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+# define __KERNEL_SSE__
+# define __KERNEL_SSE2__
+# define __KERNEL_SSE3__
+# define __KERNEL_SSSE3__
+# define __KERNEL_SSE41__
+# define __KERNEL_AVX__
+# define __KERNEL_AVX2__
+# endif
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 */
+
+#include "kernel/kernel.h"
+#define KERNEL_ARCH cpu_avx2
+#include "kernel/kernels/cpu/kernel_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_cpu.h b/intern/cycles/kernel/kernels/cpu/kernel_cpu.h
index 896b80d783e..c8938534fe8 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_cpu.h
+++ b/intern/cycles/kernel/kernels/cpu/kernel_cpu.h
@@ -77,16 +77,17 @@ DECLARE_SPLIT_KERNEL_FUNCTION(lamp_emission)
DECLARE_SPLIT_KERNEL_FUNCTION(do_volume)
DECLARE_SPLIT_KERNEL_FUNCTION(queue_enqueue)
DECLARE_SPLIT_KERNEL_FUNCTION(indirect_background)
+DECLARE_SPLIT_KERNEL_FUNCTION(shader_setup)
+DECLARE_SPLIT_KERNEL_FUNCTION(shader_sort)
DECLARE_SPLIT_KERNEL_FUNCTION(shader_eval)
DECLARE_SPLIT_KERNEL_FUNCTION(holdout_emission_blurring_pathtermination_ao)
DECLARE_SPLIT_KERNEL_FUNCTION(subsurface_scatter)
DECLARE_SPLIT_KERNEL_FUNCTION(direct_lighting)
DECLARE_SPLIT_KERNEL_FUNCTION(shadow_blocked_ao)
DECLARE_SPLIT_KERNEL_FUNCTION(shadow_blocked_dl)
+DECLARE_SPLIT_KERNEL_FUNCTION(enqueue_inactive)
DECLARE_SPLIT_KERNEL_FUNCTION(next_iteration_setup)
DECLARE_SPLIT_KERNEL_FUNCTION(indirect_subsurface)
DECLARE_SPLIT_KERNEL_FUNCTION(buffer_update)
-void KERNEL_FUNCTION_FULL_NAME(register_functions)(void(*reg)(const char* name, void* func));
-
#undef KERNEL_ARCH
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_cpu_image.h b/intern/cycles/kernel/kernels/cpu/kernel_cpu_image.h
index af68907a5c2..f6bb4c25012 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_cpu_image.h
+++ b/intern/cycles/kernel/kernels/cpu/kernel_cpu_image.h
@@ -23,51 +23,59 @@ CCL_NAMESPACE_BEGIN
ccl_device float4 kernel_tex_image_interp_impl(KernelGlobals *kg, int tex, float x, float y)
{
- if(tex >= TEX_START_HALF_CPU)
- return kg->texture_half_images[tex - TEX_START_HALF_CPU].interp(x, y);
- else if(tex >= TEX_START_BYTE_CPU)
- return kg->texture_byte_images[tex - TEX_START_BYTE_CPU].interp(x, y);
- else if(tex >= TEX_START_FLOAT_CPU)
- return kg->texture_float_images[tex - TEX_START_FLOAT_CPU].interp(x, y);
- else if(tex >= TEX_START_HALF4_CPU)
- return kg->texture_half4_images[tex - TEX_START_HALF4_CPU].interp(x, y);
- else if(tex >= TEX_START_BYTE4_CPU)
- return kg->texture_byte4_images[tex - TEX_START_BYTE4_CPU].interp(x, y);
- else
- return kg->texture_float4_images[tex].interp(x, y);
+ switch(kernel_tex_type(tex)) {
+ case IMAGE_DATA_TYPE_HALF:
+ return kg->texture_half_images[kernel_tex_index(tex)].interp(x, y);
+ case IMAGE_DATA_TYPE_BYTE:
+ return kg->texture_byte_images[kernel_tex_index(tex)].interp(x, y);
+ case IMAGE_DATA_TYPE_FLOAT:
+ return kg->texture_float_images[kernel_tex_index(tex)].interp(x, y);
+ case IMAGE_DATA_TYPE_HALF4:
+ return kg->texture_half4_images[kernel_tex_index(tex)].interp(x, y);
+ case IMAGE_DATA_TYPE_BYTE4:
+ return kg->texture_byte4_images[kernel_tex_index(tex)].interp(x, y);
+ case IMAGE_DATA_TYPE_FLOAT4:
+ default:
+ return kg->texture_float4_images[kernel_tex_index(tex)].interp(x, y);
+ }
}
ccl_device float4 kernel_tex_image_interp_3d_impl(KernelGlobals *kg, int tex, float x, float y, float z)
{
- if(tex >= TEX_START_HALF_CPU)
- return kg->texture_half_images[tex - TEX_START_HALF_CPU].interp_3d(x, y, z);
- else if(tex >= TEX_START_BYTE_CPU)
- return kg->texture_byte_images[tex - TEX_START_BYTE_CPU].interp_3d(x, y, z);
- else if(tex >= TEX_START_FLOAT_CPU)
- return kg->texture_float_images[tex - TEX_START_FLOAT_CPU].interp_3d(x, y, z);
- else if(tex >= TEX_START_HALF4_CPU)
- return kg->texture_half4_images[tex - TEX_START_HALF4_CPU].interp_3d(x, y, z);
- else if(tex >= TEX_START_BYTE4_CPU)
- return kg->texture_byte4_images[tex - TEX_START_BYTE4_CPU].interp_3d(x, y, z);
- else
- return kg->texture_float4_images[tex].interp_3d(x, y, z);
-
+ switch(kernel_tex_type(tex)) {
+ case IMAGE_DATA_TYPE_HALF:
+ return kg->texture_half_images[kernel_tex_index(tex)].interp_3d(x, y, z);
+ case IMAGE_DATA_TYPE_BYTE:
+ return kg->texture_byte_images[kernel_tex_index(tex)].interp_3d(x, y, z);
+ case IMAGE_DATA_TYPE_FLOAT:
+ return kg->texture_float_images[kernel_tex_index(tex)].interp_3d(x, y, z);
+ case IMAGE_DATA_TYPE_HALF4:
+ return kg->texture_half4_images[kernel_tex_index(tex)].interp_3d(x, y, z);
+ case IMAGE_DATA_TYPE_BYTE4:
+ return kg->texture_byte4_images[kernel_tex_index(tex)].interp_3d(x, y, z);
+ case IMAGE_DATA_TYPE_FLOAT4:
+ default:
+ return kg->texture_float4_images[kernel_tex_index(tex)].interp_3d(x, y, z);
+ }
}
ccl_device float4 kernel_tex_image_interp_3d_ex_impl(KernelGlobals *kg, int tex, float x, float y, float z, int interpolation)
{
- if(tex >= TEX_START_HALF_CPU)
- return kg->texture_half_images[tex - TEX_START_HALF_CPU].interp_3d_ex(x, y, z, interpolation);
- else if(tex >= TEX_START_BYTE_CPU)
- return kg->texture_byte_images[tex - TEX_START_BYTE_CPU].interp_3d_ex(x, y, z, interpolation);
- else if(tex >= TEX_START_FLOAT_CPU)
- return kg->texture_float_images[tex - TEX_START_FLOAT_CPU].interp_3d_ex(x, y, z, interpolation);
- else if(tex >= TEX_START_HALF4_CPU)
- return kg->texture_half4_images[tex - TEX_START_HALF4_CPU].interp_3d_ex(x, y, z, interpolation);
- else if(tex >= TEX_START_BYTE4_CPU)
- return kg->texture_byte4_images[tex - TEX_START_BYTE4_CPU].interp_3d_ex(x, y, z, interpolation);
- else
- return kg->texture_float4_images[tex].interp_3d_ex(x, y, z, interpolation);
+ switch(kernel_tex_type(tex)) {
+ case IMAGE_DATA_TYPE_HALF:
+ return kg->texture_half_images[kernel_tex_index(tex)].interp_3d_ex(x, y, z, interpolation);
+ case IMAGE_DATA_TYPE_BYTE:
+ return kg->texture_byte_images[kernel_tex_index(tex)].interp_3d_ex(x, y, z, interpolation);
+ case IMAGE_DATA_TYPE_FLOAT:
+ return kg->texture_float_images[kernel_tex_index(tex)].interp_3d_ex(x, y, z, interpolation);
+ case IMAGE_DATA_TYPE_HALF4:
+ return kg->texture_half4_images[kernel_tex_index(tex)].interp_3d_ex(x, y, z, interpolation);
+ case IMAGE_DATA_TYPE_BYTE4:
+ return kg->texture_byte4_images[kernel_tex_index(tex)].interp_3d_ex(x, y, z, interpolation);
+ case IMAGE_DATA_TYPE_FLOAT4:
+ default:
+ return kg->texture_float4_images[kernel_tex_index(tex)].interp_3d_ex(x, y, z, interpolation);
+ }
}
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h b/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h
index 148b2eef568..d4315ee5ec4 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h
+++ b/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h
@@ -22,38 +22,50 @@
#include "kernel/kernel_compat_cpu.h"
-#ifndef __SPLIT_KERNEL__
-# include "kernel/kernel_math.h"
-# include "kernel/kernel_types.h"
-
-# include "kernel/split/kernel_split_data.h"
-# include "kernel/kernel_globals.h"
-
-# include "kernel/kernels/cpu/kernel_cpu_image.h"
-# include "kernel/kernel_film.h"
-# include "kernel/kernel_path.h"
-# include "kernel/kernel_path_branched.h"
-# include "kernel/kernel_bake.h"
+#ifndef KERNEL_STUB
+# ifndef __SPLIT_KERNEL__
+# include "kernel/kernel_math.h"
+# include "kernel/kernel_types.h"
+
+# include "kernel/split/kernel_split_data.h"
+# include "kernel/kernel_globals.h"
+
+# include "kernel/kernels/cpu/kernel_cpu_image.h"
+# include "kernel/kernel_film.h"
+# include "kernel/kernel_path.h"
+# include "kernel/kernel_path_branched.h"
+# include "kernel/kernel_bake.h"
+# else
+# include "kernel/split/kernel_split_common.h"
+
+# include "kernel/split/kernel_data_init.h"
+# include "kernel/split/kernel_path_init.h"
+# include "kernel/split/kernel_scene_intersect.h"
+# include "kernel/split/kernel_lamp_emission.h"
+# include "kernel/split/kernel_do_volume.h"
+# include "kernel/split/kernel_queue_enqueue.h"
+# include "kernel/split/kernel_indirect_background.h"
+# include "kernel/split/kernel_shader_setup.h"
+# include "kernel/split/kernel_shader_sort.h"
+# include "kernel/split/kernel_shader_eval.h"
+# include "kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h"
+# include "kernel/split/kernel_subsurface_scatter.h"
+# include "kernel/split/kernel_direct_lighting.h"
+# include "kernel/split/kernel_shadow_blocked_ao.h"
+# include "kernel/split/kernel_shadow_blocked_dl.h"
+# include "kernel/split/kernel_enqueue_inactive.h"
+# include "kernel/split/kernel_next_iteration_setup.h"
+# include "kernel/split/kernel_indirect_subsurface.h"
+# include "kernel/split/kernel_buffer_update.h"
+# endif /* __SPLIT_KERNEL__ */
#else
-# include "kernel/split/kernel_split_common.h"
-
-# include "kernel/split/kernel_data_init.h"
-# include "kernel/split/kernel_path_init.h"
-# include "kernel/split/kernel_scene_intersect.h"
-# include "kernel/split/kernel_lamp_emission.h"
-# include "kernel/split/kernel_do_volume.h"
-# include "kernel/split/kernel_queue_enqueue.h"
-# include "kernel/split/kernel_indirect_background.h"
-# include "kernel/split/kernel_shader_eval.h"
-# include "kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h"
-# include "kernel/split/kernel_subsurface_scatter.h"
-# include "kernel/split/kernel_direct_lighting.h"
-# include "kernel/split/kernel_shadow_blocked_ao.h"
-# include "kernel/split/kernel_shadow_blocked_dl.h"
-# include "kernel/split/kernel_next_iteration_setup.h"
-# include "kernel/split/kernel_indirect_subsurface.h"
-# include "kernel/split/kernel_buffer_update.h"
-#endif
+# include "util/util_debug.h"
+# define STUB_ASSERT(arch, name) assert(!(#name " kernel stub for architecture " #arch " was called!"))
+
+# ifdef __SPLIT_KERNEL__
+# include "kernel/split/kernel_data_init.h"
+# endif /* __SPLIT_KERNEL__ */
+#endif /* KERNEL_STUB */
CCL_NAMESPACE_BEGIN
@@ -69,7 +81,10 @@ void KERNEL_FUNCTION_FULL_NAME(path_trace)(KernelGlobals *kg,
int offset,
int stride)
{
-#ifdef __BRANCHED_PATH__
+#ifdef KERNEL_STUB
+ STUB_ASSERT(KERNEL_ARCH, path_trace);
+#else
+# ifdef __BRANCHED_PATH__
if(kernel_data.integrator.branched) {
kernel_branched_path_trace(kg,
buffer,
@@ -80,10 +95,11 @@ void KERNEL_FUNCTION_FULL_NAME(path_trace)(KernelGlobals *kg,
stride);
}
else
-#endif
+# endif
{
kernel_path_trace(kg, buffer, rng_state, sample, x, y, offset, stride);
}
+#endif /* KERNEL_STUB */
}
/* Film */
@@ -96,6 +112,9 @@ void KERNEL_FUNCTION_FULL_NAME(convert_to_byte)(KernelGlobals *kg,
int offset,
int stride)
{
+#ifdef KERNEL_STUB
+ STUB_ASSERT(KERNEL_ARCH, convert_to_byte);
+#else
kernel_film_convert_to_byte(kg,
rgba,
buffer,
@@ -103,6 +122,7 @@ void KERNEL_FUNCTION_FULL_NAME(convert_to_byte)(KernelGlobals *kg,
x, y,
offset,
stride);
+#endif /* KERNEL_STUB */
}
void KERNEL_FUNCTION_FULL_NAME(convert_to_half_float)(KernelGlobals *kg,
@@ -113,6 +133,9 @@ void KERNEL_FUNCTION_FULL_NAME(convert_to_half_float)(KernelGlobals *kg,
int offset,
int stride)
{
+#ifdef KERNEL_STUB
+ STUB_ASSERT(KERNEL_ARCH, convert_to_half_float);
+#else
kernel_film_convert_to_half_float(kg,
rgba,
buffer,
@@ -120,6 +143,7 @@ void KERNEL_FUNCTION_FULL_NAME(convert_to_half_float)(KernelGlobals *kg,
x, y,
offset,
stride);
+#endif /* KERNEL_STUB */
}
/* Shader Evaluate */
@@ -134,9 +158,12 @@ void KERNEL_FUNCTION_FULL_NAME(shader)(KernelGlobals *kg,
int offset,
int sample)
{
+#ifdef KERNEL_STUB
+ STUB_ASSERT(KERNEL_ARCH, shader);
+#else
if(type >= SHADER_EVAL_BAKE) {
kernel_assert(output_luma == NULL);
-#ifdef __BAKING__
+# ifdef __BAKING__
kernel_bake_evaluate(kg,
input,
output,
@@ -145,7 +172,7 @@ void KERNEL_FUNCTION_FULL_NAME(shader)(KernelGlobals *kg,
i,
offset,
sample);
-#endif
+# endif
}
else {
kernel_shader_evaluate(kg,
@@ -156,24 +183,39 @@ void KERNEL_FUNCTION_FULL_NAME(shader)(KernelGlobals *kg,
i,
sample);
}
+#endif /* KERNEL_STUB */
}
#else /* __SPLIT_KERNEL__ */
/* Split Kernel Path Tracing */
-#define DEFINE_SPLIT_KERNEL_FUNCTION(name) \
+#ifdef KERNEL_STUB
+# define DEFINE_SPLIT_KERNEL_FUNCTION(name) \
+ void KERNEL_FUNCTION_FULL_NAME(name)(KernelGlobals *kg, KernelData* /*data*/) \
+ { \
+ STUB_ASSERT(KERNEL_ARCH, name); \
+ }
+
+# define DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(name, type) \
+ void KERNEL_FUNCTION_FULL_NAME(name)(KernelGlobals *kg, KernelData* /*data*/) \
+ { \
+ STUB_ASSERT(KERNEL_ARCH, name); \
+ }
+#else
+# define DEFINE_SPLIT_KERNEL_FUNCTION(name) \
void KERNEL_FUNCTION_FULL_NAME(name)(KernelGlobals *kg, KernelData* /*data*/) \
{ \
kernel_##name(kg); \
}
-#define DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(name, type) \
+# define DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(name, type) \
void KERNEL_FUNCTION_FULL_NAME(name)(KernelGlobals *kg, KernelData* /*data*/) \
{ \
ccl_local type locals; \
kernel_##name(kg, &locals); \
}
+#endif /* KERNEL_STUB */
DEFINE_SPLIT_KERNEL_FUNCTION(path_init)
DEFINE_SPLIT_KERNEL_FUNCTION(scene_intersect)
@@ -181,49 +223,22 @@ DEFINE_SPLIT_KERNEL_FUNCTION(lamp_emission)
DEFINE_SPLIT_KERNEL_FUNCTION(do_volume)
DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(queue_enqueue, QueueEnqueueLocals)
DEFINE_SPLIT_KERNEL_FUNCTION(indirect_background)
-DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(shader_eval, uint)
+DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(shader_setup, uint)
+DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(shader_sort, ShaderSortLocals)
+DEFINE_SPLIT_KERNEL_FUNCTION(shader_eval)
DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(holdout_emission_blurring_pathtermination_ao, BackgroundAOLocals)
-DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(subsurface_scatter, uint)
+DEFINE_SPLIT_KERNEL_FUNCTION(subsurface_scatter)
DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(direct_lighting, uint)
DEFINE_SPLIT_KERNEL_FUNCTION(shadow_blocked_ao)
DEFINE_SPLIT_KERNEL_FUNCTION(shadow_blocked_dl)
+DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(enqueue_inactive, uint)
DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(next_iteration_setup, uint)
DEFINE_SPLIT_KERNEL_FUNCTION(indirect_subsurface)
DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(buffer_update, uint)
-
-void KERNEL_FUNCTION_FULL_NAME(register_functions)(void(*reg)(const char* name, void* func))
-{
-#define REGISTER_NAME_STRING(name) #name
-#define REGISTER_EVAL_NAME(name) REGISTER_NAME_STRING(name)
-#define REGISTER(name) reg(REGISTER_EVAL_NAME(KERNEL_FUNCTION_FULL_NAME(name)), (void*)KERNEL_FUNCTION_FULL_NAME(name));
-
- REGISTER(path_trace);
- REGISTER(convert_to_byte);
- REGISTER(convert_to_half_float);
- REGISTER(shader);
-
- REGISTER(data_init);
- REGISTER(path_init);
- REGISTER(scene_intersect);
- REGISTER(lamp_emission);
- REGISTER(do_volume);
- REGISTER(queue_enqueue);
- REGISTER(indirect_background);
- REGISTER(shader_eval);
- REGISTER(holdout_emission_blurring_pathtermination_ao);
- REGISTER(subsurface_scatter);
- REGISTER(direct_lighting);
- REGISTER(shadow_blocked_ao);
- REGISTER(shadow_blocked_dl);
- REGISTER(next_iteration_setup);
- REGISTER(indirect_subsurface);
- REGISTER(buffer_update);
-
-#undef REGISTER
-#undef REGISTER_EVAL_NAME
-#undef REGISTER_NAME_STRING
-}
-
#endif /* __SPLIT_KERNEL__ */
+#undef KERNEL_STUB
+#undef STUB_ASSERT
+#undef KERNEL_ARCH
+
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_split_avx.cpp b/intern/cycles/kernel/kernels/cpu/kernel_split_avx.cpp
index 27a746a0799..6ba3425a343 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_split_avx.cpp
+++ b/intern/cycles/kernel/kernels/cpu/kernel_split_avx.cpp
@@ -17,22 +17,25 @@
/* Optimized CPU kernel entry points. This file is compiled with AVX
* optimization flags and nearly all functions inlined, while kernel.cpp
* is compiled without for other CPU's. */
-
-/* SSE optimization disabled for now on 32 bit, see bug #36316 */
-#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
-# define __KERNEL_SSE2__
-# define __KERNEL_SSE3__
-# define __KERNEL_SSSE3__
-# define __KERNEL_SSE41__
-# define __KERNEL_AVX__
-#endif
#define __SPLIT_KERNEL__
#include "util/util_optimization.h"
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
-# include "kernel/kernel.h"
-# define KERNEL_ARCH cpu_avx
-# include "kernel/kernels/cpu/kernel_cpu_impl.h"
+#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
+# define KERNEL_STUB
+#else
+/* SSE optimization disabled for now on 32 bit, see bug #36316 */
+# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+# define __KERNEL_SSE__
+# define __KERNEL_SSE2__
+# define __KERNEL_SSE3__
+# define __KERNEL_SSSE3__
+# define __KERNEL_SSE41__
+# define __KERNEL_AVX__
+# endif
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX */
+
+#include "kernel/kernel.h"
+#define KERNEL_ARCH cpu_avx
+#include "kernel/kernels/cpu/kernel_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_split_avx2.cpp b/intern/cycles/kernel/kernels/cpu/kernel_split_avx2.cpp
index 364d279a189..76b2d77ebb8 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_split_avx2.cpp
+++ b/intern/cycles/kernel/kernels/cpu/kernel_split_avx2.cpp
@@ -18,23 +18,25 @@
* optimization flags and nearly all functions inlined, while kernel.cpp
* is compiled without for other CPU's. */
-/* SSE optimization disabled for now on 32 bit, see bug #36316 */
-#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
-# define __KERNEL_SSE__
-# define __KERNEL_SSE2__
-# define __KERNEL_SSE3__
-# define __KERNEL_SSSE3__
-# define __KERNEL_SSE41__
-# define __KERNEL_AVX__
-# define __KERNEL_AVX2__
-#endif
-
#define __SPLIT_KERNEL__
#include "util/util_optimization.h"
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
-# include "kernel/kernel.h"
-# define KERNEL_ARCH cpu_avx2
-# include "kernel/kernels/cpu/kernel_cpu_impl.h"
+#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
+# define KERNEL_STUB
+#else
+/* SSE optimization disabled for now on 32 bit, see bug #36316 */
+# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+# define __KERNEL_SSE__
+# define __KERNEL_SSE2__
+# define __KERNEL_SSE3__
+# define __KERNEL_SSSE3__
+# define __KERNEL_SSE41__
+# define __KERNEL_AVX__
+# define __KERNEL_AVX2__
+# endif
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 */
+
+#include "kernel/kernel.h"
+#define KERNEL_ARCH cpu_avx2
+#include "kernel/kernels/cpu/kernel_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_split_sse2.cpp b/intern/cycles/kernel/kernels/cpu/kernel_split_sse2.cpp
index 0afb481296f..b468b6f44c8 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_split_sse2.cpp
+++ b/intern/cycles/kernel/kernels/cpu/kernel_split_sse2.cpp
@@ -18,17 +18,19 @@
* optimization flags and nearly all functions inlined, while kernel.cpp
* is compiled without for other CPU's. */
-/* SSE optimization disabled for now on 32 bit, see bug #36316 */
-#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
-# define __KERNEL_SSE2__
-#endif
-
#define __SPLIT_KERNEL__
#include "util/util_optimization.h"
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
-# include "kernel/kernel.h"
-# define KERNEL_ARCH cpu_sse2
-# include "kernel/kernels/cpu/kernel_cpu_impl.h"
+#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
+# define KERNEL_STUB
+#else
+/* SSE optimization disabled for now on 32 bit, see bug #36316 */
+# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+# define __KERNEL_SSE2__
+# endif
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 */
+
+#include "kernel/kernel.h"
+#define KERNEL_ARCH cpu_sse2
+#include "kernel/kernels/cpu/kernel_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_split_sse3.cpp b/intern/cycles/kernel/kernels/cpu/kernel_split_sse3.cpp
index 13d00813591..3e5792d0b17 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_split_sse3.cpp
+++ b/intern/cycles/kernel/kernels/cpu/kernel_split_sse3.cpp
@@ -18,19 +18,21 @@
* optimization flags and nearly all functions inlined, while kernel.cpp
* is compiled without for other CPU's. */
-/* SSE optimization disabled for now on 32 bit, see bug #36316 */
-#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
-# define __KERNEL_SSE2__
-# define __KERNEL_SSE3__
-# define __KERNEL_SSSE3__
-#endif
-
#define __SPLIT_KERNEL__
#include "util/util_optimization.h"
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
-# include "kernel/kernel.h"
-# define KERNEL_ARCH cpu_sse3
-# include "kernel/kernels/cpu/kernel_cpu_impl.h"
+#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
+# define KERNEL_STUB
+#else
+/* SSE optimization disabled for now on 32 bit, see bug #36316 */
+# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+# define __KERNEL_SSE2__
+# define __KERNEL_SSE3__
+# define __KERNEL_SSSE3__
+# endif
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 */
+
+#include "kernel/kernel.h"
+#define KERNEL_ARCH cpu_sse3
+#include "kernel/kernels/cpu/kernel_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_split_sse41.cpp b/intern/cycles/kernel/kernels/cpu/kernel_split_sse41.cpp
index a4312071edc..3629f21cd29 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_split_sse41.cpp
+++ b/intern/cycles/kernel/kernels/cpu/kernel_split_sse41.cpp
@@ -18,20 +18,22 @@
* optimization flags and nearly all functions inlined, while kernel.cpp
* is compiled without for other CPU's. */
-/* SSE optimization disabled for now on 32 bit, see bug #36316 */
-#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
-# define __KERNEL_SSE2__
-# define __KERNEL_SSE3__
-# define __KERNEL_SSSE3__
-# define __KERNEL_SSE41__
-#endif
-
#define __SPLIT_KERNEL__
#include "util/util_optimization.h"
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
-# include "kernel/kernel.h"
-# define KERNEL_ARCH cpu_sse41
-# include "kernel/kernels/cpu/kernel_cpu_impl.h"
+#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
+# define KERNEL_STUB
+#else
+/* SSE optimization disabled for now on 32 bit, see bug #36316 */
+# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+# define __KERNEL_SSE2__
+# define __KERNEL_SSE3__
+# define __KERNEL_SSSE3__
+# define __KERNEL_SSE41__
+# endif
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 */
+
+#include "kernel/kernel.h"
+#define KERNEL_ARCH cpu_sse41
+#include "kernel/kernels/cpu/kernel_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_sse2.cpp b/intern/cycles/kernel/kernels/cpu/kernel_sse2.cpp
index 1acfaa91ac9..57530c88710 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_sse2.cpp
+++ b/intern/cycles/kernel/kernels/cpu/kernel_sse2.cpp
@@ -18,15 +18,17 @@
* optimization flags and nearly all functions inlined, while kernel.cpp
* is compiled without for other CPU's. */
-/* SSE optimization disabled for now on 32 bit, see bug #36316 */
-#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
-# define __KERNEL_SSE2__
-#endif
-
#include "util/util_optimization.h"
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
-# include "kernel/kernel.h"
-# define KERNEL_ARCH cpu_sse2
-# include "kernel/kernels/cpu/kernel_cpu_impl.h"
+#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
+# define KERNEL_STUB
+#else
+/* SSE optimization disabled for now on 32 bit, see bug #36316 */
+# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+# define __KERNEL_SSE2__
+# endif
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 */
+
+#include "kernel/kernel.h"
+#define KERNEL_ARCH cpu_sse2
+#include "kernel/kernels/cpu/kernel_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_sse3.cpp b/intern/cycles/kernel/kernels/cpu/kernel_sse3.cpp
index f7b6a2e21fe..c607753bc4b 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_sse3.cpp
+++ b/intern/cycles/kernel/kernels/cpu/kernel_sse3.cpp
@@ -18,17 +18,19 @@
* optimization flags and nearly all functions inlined, while kernel.cpp
* is compiled without for other CPU's. */
-/* SSE optimization disabled for now on 32 bit, see bug #36316 */
-#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
-# define __KERNEL_SSE2__
-# define __KERNEL_SSE3__
-# define __KERNEL_SSSE3__
-#endif
-
#include "util/util_optimization.h"
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
-# include "kernel/kernel.h"
-# define KERNEL_ARCH cpu_sse3
-# include "kernel/kernels/cpu/kernel_cpu_impl.h"
+#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
+# define KERNEL_STUB
+#else
+/* SSE optimization disabled for now on 32 bit, see bug #36316 */
+# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+# define __KERNEL_SSE2__
+# define __KERNEL_SSE3__
+# define __KERNEL_SSSE3__
+# endif
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 */
+
+#include "kernel/kernel.h"
+#define KERNEL_ARCH cpu_sse3
+#include "kernel/kernels/cpu/kernel_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_sse41.cpp b/intern/cycles/kernel/kernels/cpu/kernel_sse41.cpp
index 1900c6e3012..a278554731c 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_sse41.cpp
+++ b/intern/cycles/kernel/kernels/cpu/kernel_sse41.cpp
@@ -18,18 +18,20 @@
* optimization flags and nearly all functions inlined, while kernel.cpp
* is compiled without for other CPU's. */
-/* SSE optimization disabled for now on 32 bit, see bug #36316 */
-#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
-# define __KERNEL_SSE2__
-# define __KERNEL_SSE3__
-# define __KERNEL_SSSE3__
-# define __KERNEL_SSE41__
-#endif
-
#include "util/util_optimization.h"
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
-# include "kernel/kernel.h"
-# define KERNEL_ARCH cpu_sse41
-# include "kernel/kernels/cpu//kernel_cpu_impl.h"
+#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
+# define KERNEL_STUB
+#else
+/* SSE optimization disabled for now on 32 bit, see bug #36316 */
+# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+# define __KERNEL_SSE2__
+# define __KERNEL_SSE3__
+# define __KERNEL_SSSE3__
+# define __KERNEL_SSE41__
+# endif
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 */
+
+#include "kernel/kernel.h"
+#define KERNEL_ARCH cpu_sse41
+#include "kernel/kernels/cpu/kernel_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cuda/filter.cu b/intern/cycles/kernel/kernels/cuda/filter.cu
new file mode 100644
index 00000000000..009c3fde9d5
--- /dev/null
+++ b/intern/cycles/kernel/kernels/cuda/filter.cu
@@ -0,0 +1,255 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* CUDA kernel entry points */
+
+#ifdef __CUDA_ARCH__
+
+#include "kernel_config.h"
+
+#include "kernel/kernel_compat_cuda.h"
+
+#include "kernel/filter/filter_kernel.h"
+
+/* kernels */
+
+extern "C" __global__ void
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
+kernel_cuda_filter_divide_shadow(int sample,
+ TilesInfo *tiles,
+ float *unfilteredA,
+ float *unfilteredB,
+ float *sampleVariance,
+ float *sampleVarianceV,
+ float *bufferVariance,
+ int4 prefilter_rect,
+ int buffer_pass_stride,
+ int buffer_denoising_offset,
+ bool use_split_variance)
+{
+ int x = prefilter_rect.x + blockDim.x*blockIdx.x + threadIdx.x;
+ int y = prefilter_rect.y + blockDim.y*blockIdx.y + threadIdx.y;
+ if(x < prefilter_rect.z && y < prefilter_rect.w) {
+ kernel_filter_divide_shadow(sample,
+ tiles,
+ x, y,
+ unfilteredA,
+ unfilteredB,
+ sampleVariance,
+ sampleVarianceV,
+ bufferVariance,
+ prefilter_rect,
+ buffer_pass_stride,
+ buffer_denoising_offset,
+ use_split_variance);
+ }
+}
+
+extern "C" __global__ void
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
+kernel_cuda_filter_get_feature(int sample,
+ TilesInfo *tiles,
+ int m_offset,
+ int v_offset,
+ float *mean,
+ float *variance,
+ int4 prefilter_rect,
+ int buffer_pass_stride,
+ int buffer_denoising_offset,
+ bool use_split_variance)
+{
+ int x = prefilter_rect.x + blockDim.x*blockIdx.x + threadIdx.x;
+ int y = prefilter_rect.y + blockDim.y*blockIdx.y + threadIdx.y;
+ if(x < prefilter_rect.z && y < prefilter_rect.w) {
+ kernel_filter_get_feature(sample,
+ tiles,
+ m_offset, v_offset,
+ x, y,
+ mean, variance,
+ prefilter_rect,
+ buffer_pass_stride,
+ buffer_denoising_offset,
+ use_split_variance);
+ }
+}
+
+extern "C" __global__ void
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
+kernel_cuda_filter_detect_outliers(float *image,
+ float *variance,
+ float *depth,
+ float *output,
+ int4 prefilter_rect,
+ int pass_stride)
+{
+ int x = prefilter_rect.x + blockDim.x*blockIdx.x + threadIdx.x;
+ int y = prefilter_rect.y + blockDim.y*blockIdx.y + threadIdx.y;
+ if(x < prefilter_rect.z && y < prefilter_rect.w) {
+ kernel_filter_detect_outliers(x, y, image, variance, depth, output, prefilter_rect, pass_stride);
+ }
+}
+
+extern "C" __global__ void
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
+kernel_cuda_filter_combine_halves(float *mean, float *variance, float *a, float *b, int4 prefilter_rect, int r)
+{
+ int x = prefilter_rect.x + blockDim.x*blockIdx.x + threadIdx.x;
+ int y = prefilter_rect.y + blockDim.y*blockIdx.y + threadIdx.y;
+ if(x < prefilter_rect.z && y < prefilter_rect.w) {
+ kernel_filter_combine_halves(x, y, mean, variance, a, b, prefilter_rect, r);
+ }
+}
+
+extern "C" __global__ void
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
+kernel_cuda_filter_construct_transform(float const* __restrict__ buffer,
+ float *transform, int *rank,
+ int4 filter_area, int4 rect,
+ int radius, float pca_threshold,
+ int pass_stride)
+{
+ int x = blockDim.x*blockIdx.x + threadIdx.x;
+ int y = blockDim.y*blockIdx.y + threadIdx.y;
+ if(x < filter_area.z && y < filter_area.w) {
+ int *l_rank = rank + y*filter_area.z + x;
+ float *l_transform = transform + y*filter_area.z + x;
+ kernel_filter_construct_transform(buffer,
+ x + filter_area.x, y + filter_area.y,
+ rect, pass_stride,
+ l_transform, l_rank,
+ radius, pca_threshold,
+ filter_area.z*filter_area.w,
+ threadIdx.y*blockDim.x + threadIdx.x);
+ }
+}
+
+extern "C" __global__ void
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
+kernel_cuda_filter_nlm_calc_difference(int dx, int dy,
+ const float *ccl_restrict weight_image,
+ const float *ccl_restrict variance_image,
+ float *difference_image,
+ int4 rect, int w,
+ int channel_offset,
+ float a, float k_2)
+{
+ int x = blockDim.x*blockIdx.x + threadIdx.x + rect.x;
+ int y = blockDim.y*blockIdx.y + threadIdx.y + rect.y;
+ if(x < rect.z && y < rect.w) {
+ kernel_filter_nlm_calc_difference(x, y, dx, dy, weight_image, variance_image, difference_image, rect, w, channel_offset, a, k_2);
+ }
+}
+
+extern "C" __global__ void
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
+kernel_cuda_filter_nlm_blur(const float *ccl_restrict difference_image, float *out_image, int4 rect, int w, int f)
+{
+ int x = blockDim.x*blockIdx.x + threadIdx.x + rect.x;
+ int y = blockDim.y*blockIdx.y + threadIdx.y + rect.y;
+ if(x < rect.z && y < rect.w) {
+ kernel_filter_nlm_blur(x, y, difference_image, out_image, rect, w, f);
+ }
+}
+
+extern "C" __global__ void
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
+kernel_cuda_filter_nlm_calc_weight(const float *ccl_restrict difference_image, float *out_image, int4 rect, int w, int f)
+{
+ int x = blockDim.x*blockIdx.x + threadIdx.x + rect.x;
+ int y = blockDim.y*blockIdx.y + threadIdx.y + rect.y;
+ if(x < rect.z && y < rect.w) {
+ kernel_filter_nlm_calc_weight(x, y, difference_image, out_image, rect, w, f);
+ }
+}
+
+extern "C" __global__ void
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
+kernel_cuda_filter_nlm_update_output(int dx, int dy,
+ const float *ccl_restrict difference_image,
+ const float *ccl_restrict image,
+ float *out_image, float *accum_image,
+ int4 rect, int w,
+ int f)
+{
+ int x = blockDim.x*blockIdx.x + threadIdx.x + rect.x;
+ int y = blockDim.y*blockIdx.y + threadIdx.y + rect.y;
+ if(x < rect.z && y < rect.w) {
+ kernel_filter_nlm_update_output(x, y, dx, dy, difference_image, image, out_image, accum_image, rect, w, f);
+ }
+}
+
+extern "C" __global__ void
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
+kernel_cuda_filter_nlm_normalize(float *out_image, const float *ccl_restrict accum_image, int4 rect, int w)
+{
+ int x = blockDim.x*blockIdx.x + threadIdx.x + rect.x;
+ int y = blockDim.y*blockIdx.y + threadIdx.y + rect.y;
+ if(x < rect.z && y < rect.w) {
+ kernel_filter_nlm_normalize(x, y, out_image, accum_image, rect, w);
+ }
+}
+
+extern "C" __global__ void
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
+kernel_cuda_filter_nlm_construct_gramian(int dx, int dy,
+ const float *ccl_restrict difference_image,
+ const float *ccl_restrict buffer,
+ float const* __restrict__ transform,
+ int *rank,
+ float *XtWX,
+ float3 *XtWY,
+ int4 rect,
+ int4 filter_rect,
+ int w, int h, int f,
+ int pass_stride)
+{
+ int x = blockDim.x*blockIdx.x + threadIdx.x + max(0, rect.x-filter_rect.x);
+ int y = blockDim.y*blockIdx.y + threadIdx.y + max(0, rect.y-filter_rect.y);
+ if(x < min(filter_rect.z, rect.z-filter_rect.x) && y < min(filter_rect.w, rect.w-filter_rect.y)) {
+ kernel_filter_nlm_construct_gramian(x, y,
+ dx, dy,
+ difference_image,
+ buffer,
+ transform, rank,
+ XtWX, XtWY,
+ rect, filter_rect,
+ w, h, f,
+ pass_stride,
+ threadIdx.y*blockDim.x + threadIdx.x);
+ }
+}
+
+extern "C" __global__ void
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
+kernel_cuda_filter_finalize(int w, int h,
+ float *buffer, int *rank,
+ float *XtWX, float3 *XtWY,
+ int4 filter_area, int4 buffer_params,
+ int sample)
+{
+ int x = blockDim.x*blockIdx.x + threadIdx.x;
+ int y = blockDim.y*blockIdx.y + threadIdx.y;
+ if(x < filter_area.z && y < filter_area.w) {
+ int storage_ofs = y*filter_area.z+x;
+ rank += storage_ofs;
+ XtWX += storage_ofs;
+ XtWY += storage_ofs;
+ kernel_filter_finalize(x, y, w, h, buffer, rank, filter_area.z*filter_area.w, XtWX, XtWY, buffer_params, sample);
+ }
+}
+
+#endif
+
diff --git a/intern/cycles/kernel/kernels/cuda/kernel_split.cu b/intern/cycles/kernel/kernels/cuda/kernel_split.cu
index a679eff8409..628891b1458 100644
--- a/intern/cycles/kernel/kernels/cuda/kernel_split.cu
+++ b/intern/cycles/kernel/kernels/cuda/kernel_split.cu
@@ -31,12 +31,15 @@
#include "kernel/split/kernel_do_volume.h"
#include "kernel/split/kernel_queue_enqueue.h"
#include "kernel/split/kernel_indirect_background.h"
+#include "kernel/split/kernel_shader_setup.h"
+#include "kernel/split/kernel_shader_sort.h"
#include "kernel/split/kernel_shader_eval.h"
#include "kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h"
#include "kernel/split/kernel_subsurface_scatter.h"
#include "kernel/split/kernel_direct_lighting.h"
#include "kernel/split/kernel_shadow_blocked_ao.h"
#include "kernel/split/kernel_shadow_blocked_dl.h"
+#include "kernel/split/kernel_enqueue_inactive.h"
#include "kernel/split/kernel_next_iteration_setup.h"
#include "kernel/split/kernel_indirect_subsurface.h"
#include "kernel/split/kernel_buffer_update.h"
@@ -108,12 +111,15 @@ DEFINE_SPLIT_KERNEL_FUNCTION(lamp_emission)
DEFINE_SPLIT_KERNEL_FUNCTION(do_volume)
DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(queue_enqueue, QueueEnqueueLocals)
DEFINE_SPLIT_KERNEL_FUNCTION(indirect_background)
-DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(shader_eval, uint)
+DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(shader_setup, uint)
+DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(shader_sort, ShaderSortLocals)
+DEFINE_SPLIT_KERNEL_FUNCTION(shader_eval)
DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(holdout_emission_blurring_pathtermination_ao, BackgroundAOLocals)
-DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(subsurface_scatter, uint)
+DEFINE_SPLIT_KERNEL_FUNCTION(subsurface_scatter)
DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(direct_lighting, uint)
DEFINE_SPLIT_KERNEL_FUNCTION(shadow_blocked_ao)
DEFINE_SPLIT_KERNEL_FUNCTION(shadow_blocked_dl)
+DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(enqueue_inactive, uint)
DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(next_iteration_setup, uint)
DEFINE_SPLIT_KERNEL_FUNCTION(indirect_subsurface)
DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(buffer_update, uint)
diff --git a/intern/cycles/kernel/kernels/opencl/filter.cl b/intern/cycles/kernel/kernels/opencl/filter.cl
new file mode 100644
index 00000000000..ba53ba4b26f
--- /dev/null
+++ b/intern/cycles/kernel/kernels/opencl/filter.cl
@@ -0,0 +1,280 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* OpenCL kernel entry points */
+
+#include "kernel/kernel_compat_opencl.h"
+
+#include "kernel/filter/filter_kernel.h"
+
+/* kernels */
+
+__kernel void kernel_ocl_filter_divide_shadow(int sample,
+ ccl_global TilesInfo *tiles,
+ ccl_global float *unfilteredA,
+ ccl_global float *unfilteredB,
+ ccl_global float *sampleVariance,
+ ccl_global float *sampleVarianceV,
+ ccl_global float *bufferVariance,
+ int4 prefilter_rect,
+ int buffer_pass_stride,
+ int buffer_denoising_offset,
+ char use_split_variance)
+{
+ int x = prefilter_rect.x + get_global_id(0);
+ int y = prefilter_rect.y + get_global_id(1);
+ if(x < prefilter_rect.z && y < prefilter_rect.w) {
+ kernel_filter_divide_shadow(sample,
+ tiles,
+ x, y,
+ unfilteredA,
+ unfilteredB,
+ sampleVariance,
+ sampleVarianceV,
+ bufferVariance,
+ prefilter_rect,
+ buffer_pass_stride,
+ buffer_denoising_offset,
+ use_split_variance);
+ }
+}
+
+__kernel void kernel_ocl_filter_get_feature(int sample,
+ ccl_global TilesInfo *tiles,
+ int m_offset,
+ int v_offset,
+ ccl_global float *mean,
+ ccl_global float *variance,
+ int4 prefilter_rect,
+ int buffer_pass_stride,
+ int buffer_denoising_offset,
+ char use_split_variance)
+{
+ int x = prefilter_rect.x + get_global_id(0);
+ int y = prefilter_rect.y + get_global_id(1);
+ if(x < prefilter_rect.z && y < prefilter_rect.w) {
+ kernel_filter_get_feature(sample,
+ tiles,
+ m_offset, v_offset,
+ x, y,
+ mean, variance,
+ prefilter_rect,
+ buffer_pass_stride,
+ buffer_denoising_offset,
+ use_split_variance);
+ }
+}
+
+__kernel void kernel_ocl_filter_detect_outliers(ccl_global float *image,
+ ccl_global float *variance,
+ ccl_global float *depth,
+ ccl_global float *output,
+ int4 prefilter_rect,
+ int pass_stride)
+{
+ int x = prefilter_rect.x + get_global_id(0);
+ int y = prefilter_rect.y + get_global_id(1);
+ if(x < prefilter_rect.z && y < prefilter_rect.w) {
+ kernel_filter_detect_outliers(x, y, image, variance, depth, output, prefilter_rect, pass_stride);
+ }
+}
+
+__kernel void kernel_ocl_filter_combine_halves(ccl_global float *mean,
+ ccl_global float *variance,
+ ccl_global float *a,
+ ccl_global float *b,
+ int4 prefilter_rect,
+ int r)
+{
+ int x = prefilter_rect.x + get_global_id(0);
+ int y = prefilter_rect.y + get_global_id(1);
+ if(x < prefilter_rect.z && y < prefilter_rect.w) {
+ kernel_filter_combine_halves(x, y, mean, variance, a, b, prefilter_rect, r);
+ }
+}
+
+__kernel void kernel_ocl_filter_construct_transform(const ccl_global float *ccl_restrict buffer,
+ ccl_global float *transform,
+ ccl_global int *rank,
+ int4 filter_area,
+ int4 rect,
+ int pass_stride,
+ int radius,
+ float pca_threshold)
+{
+ int x = get_global_id(0);
+ int y = get_global_id(1);
+ if(x < filter_area.z && y < filter_area.w) {
+ ccl_global int *l_rank = rank + y*filter_area.z + x;
+ ccl_global float *l_transform = transform + y*filter_area.z + x;
+ kernel_filter_construct_transform(buffer,
+ x + filter_area.x, y + filter_area.y,
+ rect, pass_stride,
+ l_transform, l_rank,
+ radius, pca_threshold,
+ filter_area.z*filter_area.w,
+ get_local_id(1)*get_local_size(0) + get_local_id(0));
+ }
+}
+
+__kernel void kernel_ocl_filter_nlm_calc_difference(int dx,
+ int dy,
+ const ccl_global float *ccl_restrict weight_image,
+ const ccl_global float *ccl_restrict variance_image,
+ ccl_global float *difference_image,
+ int4 rect,
+ int w,
+ int channel_offset,
+ float a,
+ float k_2)
+{
+ int x = get_global_id(0) + rect.x;
+ int y = get_global_id(1) + rect.y;
+ if(x < rect.z && y < rect.w) {
+ kernel_filter_nlm_calc_difference(x, y, dx, dy, weight_image, variance_image, difference_image, rect, w, channel_offset, a, k_2);
+ }
+}
+
+__kernel void kernel_ocl_filter_nlm_blur(const ccl_global float *ccl_restrict difference_image,
+ ccl_global float *out_image,
+ int4 rect,
+ int w,
+ int f)
+{
+ int x = get_global_id(0) + rect.x;
+ int y = get_global_id(1) + rect.y;
+ if(x < rect.z && y < rect.w) {
+ kernel_filter_nlm_blur(x, y, difference_image, out_image, rect, w, f);
+ }
+}
+
+__kernel void kernel_ocl_filter_nlm_calc_weight(const ccl_global float *ccl_restrict difference_image,
+ ccl_global float *out_image,
+ int4 rect,
+ int w,
+ int f)
+{
+ int x = get_global_id(0) + rect.x;
+ int y = get_global_id(1) + rect.y;
+ if(x < rect.z && y < rect.w) {
+ kernel_filter_nlm_calc_weight(x, y, difference_image, out_image, rect, w, f);
+ }
+}
+
+__kernel void kernel_ocl_filter_nlm_update_output(int dx,
+ int dy,
+ const ccl_global float *ccl_restrict difference_image,
+ const ccl_global float *ccl_restrict image,
+ ccl_global float *out_image,
+ ccl_global float *accum_image,
+ int4 rect,
+ int w,
+ int f)
+{
+ int x = get_global_id(0) + rect.x;
+ int y = get_global_id(1) + rect.y;
+ if(x < rect.z && y < rect.w) {
+ kernel_filter_nlm_update_output(x, y, dx, dy, difference_image, image, out_image, accum_image, rect, w, f);
+ }
+}
+
+__kernel void kernel_ocl_filter_nlm_normalize(ccl_global float *out_image,
+ const ccl_global float *ccl_restrict accum_image,
+ int4 rect,
+ int w)
+{
+ int x = get_global_id(0) + rect.x;
+ int y = get_global_id(1) + rect.y;
+ if(x < rect.z && y < rect.w) {
+ kernel_filter_nlm_normalize(x, y, out_image, accum_image, rect, w);
+ }
+}
+
+__kernel void kernel_ocl_filter_nlm_construct_gramian(int dx,
+ int dy,
+ const ccl_global float *ccl_restrict difference_image,
+ const ccl_global float *ccl_restrict buffer,
+ const ccl_global float *ccl_restrict transform,
+ ccl_global int *rank,
+ ccl_global float *XtWX,
+ ccl_global float3 *XtWY,
+ int4 rect,
+ int4 filter_rect,
+ int w,
+ int h,
+ int f,
+ int pass_stride)
+{
+ int x = get_global_id(0) + max(0, rect.x-filter_rect.x);
+ int y = get_global_id(1) + max(0, rect.y-filter_rect.y);
+ if(x < min(filter_rect.z, rect.z-filter_rect.x) && y < min(filter_rect.w, rect.w-filter_rect.y)) {
+ kernel_filter_nlm_construct_gramian(x, y,
+ dx, dy,
+ difference_image,
+ buffer,
+ transform, rank,
+ XtWX, XtWY,
+ rect, filter_rect,
+ w, h, f,
+ pass_stride,
+ get_local_id(1)*get_local_size(0) + get_local_id(0));
+ }
+}
+
+__kernel void kernel_ocl_filter_finalize(int w,
+ int h,
+ ccl_global float *buffer,
+ ccl_global int *rank,
+ ccl_global float *XtWX,
+ ccl_global float3 *XtWY,
+ int4 filter_area,
+ int4 buffer_params,
+ int sample)
+{
+ int x = get_global_id(0);
+ int y = get_global_id(1);
+ if(x < filter_area.z && y < filter_area.w) {
+ int storage_ofs = y*filter_area.z+x;
+ rank += storage_ofs;
+ XtWX += storage_ofs;
+ XtWY += storage_ofs;
+ kernel_filter_finalize(x, y, w, h, buffer, rank, filter_area.z*filter_area.w, XtWX, XtWY, buffer_params, sample);
+ }
+}
+
+__kernel void kernel_ocl_filter_set_tiles(ccl_global TilesInfo* tiles,
+ ccl_global float *buffer_1,
+ ccl_global float *buffer_2,
+ ccl_global float *buffer_3,
+ ccl_global float *buffer_4,
+ ccl_global float *buffer_5,
+ ccl_global float *buffer_6,
+ ccl_global float *buffer_7,
+ ccl_global float *buffer_8,
+ ccl_global float *buffer_9)
+{
+ if((get_global_id(0) == 0) && (get_global_id(1) == 0)) {
+ tiles->buffers[0] = buffer_1;
+ tiles->buffers[1] = buffer_2;
+ tiles->buffers[2] = buffer_3;
+ tiles->buffers[3] = buffer_4;
+ tiles->buffers[4] = buffer_5;
+ tiles->buffers[5] = buffer_6;
+ tiles->buffers[6] = buffer_7;
+ tiles->buffers[7] = buffer_8;
+ tiles->buffers[8] = buffer_9;
+ }
+}
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_buffer_update.cl b/intern/cycles/kernel/kernels/opencl/kernel_buffer_update.cl
index db65c91baf7..dcea2630aef 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_buffer_update.cl
+++ b/intern/cycles/kernel/kernels/opencl/kernel_buffer_update.cl
@@ -18,10 +18,9 @@
#include "kernel/split/kernel_split_common.h"
#include "kernel/split/kernel_buffer_update.h"
-__kernel void kernel_ocl_path_trace_buffer_update(
- ccl_global char *kg,
- ccl_constant KernelData *data)
-{
- ccl_local unsigned int local_queue_atomics;
- kernel_buffer_update((KernelGlobals*)kg, &local_queue_atomics);
-}
+#define KERNEL_NAME buffer_update
+#define LOCALS_TYPE unsigned int
+#include "kernel/kernels/opencl/kernel_split_function.h"
+#undef KERNEL_NAME
+#undef LOCALS_TYPE
+
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_direct_lighting.cl b/intern/cycles/kernel/kernels/opencl/kernel_direct_lighting.cl
index eb34f750881..ed64ae01aae 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_direct_lighting.cl
+++ b/intern/cycles/kernel/kernels/opencl/kernel_direct_lighting.cl
@@ -18,10 +18,9 @@
#include "kernel/split/kernel_split_common.h"
#include "kernel/split/kernel_direct_lighting.h"
-__kernel void kernel_ocl_path_trace_direct_lighting(
- ccl_global char *kg,
- ccl_constant KernelData *data)
-{
- ccl_local unsigned int local_queue_atomics;
- kernel_direct_lighting((KernelGlobals*)kg, &local_queue_atomics);
-}
+#define KERNEL_NAME direct_lighting
+#define LOCALS_TYPE unsigned int
+#include "kernel/kernels/opencl/kernel_split_function.h"
+#undef KERNEL_NAME
+#undef LOCALS_TYPE
+
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_do_volume.cl b/intern/cycles/kernel/kernels/opencl/kernel_do_volume.cl
index 83ef5f5f3f2..8afaa686e28 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_do_volume.cl
+++ b/intern/cycles/kernel/kernels/opencl/kernel_do_volume.cl
@@ -18,9 +18,7 @@
#include "kernel/split/kernel_split_common.h"
#include "kernel/split/kernel_do_volume.h"
-__kernel void kernel_ocl_path_trace_do_volume(
- ccl_global char *kg,
- ccl_constant KernelData *data)
-{
- kernel_do_volume((KernelGlobals*)kg);
-}
+#define KERNEL_NAME do_volume
+#include "kernel/kernels/opencl/kernel_split_function.h"
+#undef KERNEL_NAME
+
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_enqueue_inactive.cl b/intern/cycles/kernel/kernels/opencl/kernel_enqueue_inactive.cl
new file mode 100644
index 00000000000..e68d4104a91
--- /dev/null
+++ b/intern/cycles/kernel/kernels/opencl/kernel_enqueue_inactive.cl
@@ -0,0 +1,26 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernel/kernel_compat_opencl.h"
+#include "kernel/split/kernel_split_common.h"
+#include "kernel/split/kernel_enqueue_inactive.h"
+
+#define KERNEL_NAME enqueue_inactive
+#define LOCALS_TYPE unsigned int
+#include "kernel/kernels/opencl/kernel_split_function.h"
+#undef KERNEL_NAME
+#undef LOCALS_TYPE
+
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl b/intern/cycles/kernel/kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl
index d071b39aa6f..9e1e57beba6 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl
+++ b/intern/cycles/kernel/kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl
@@ -18,12 +18,9 @@
#include "kernel/split/kernel_split_common.h"
#include "kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h"
-__kernel void kernel_ocl_path_trace_holdout_emission_blurring_pathtermination_ao(
- ccl_global char *kg,
- ccl_constant KernelData *data)
-{
- ccl_local BackgroundAOLocals locals;
- kernel_holdout_emission_blurring_pathtermination_ao(
- (KernelGlobals*)kg,
- &locals);
-}
+#define KERNEL_NAME holdout_emission_blurring_pathtermination_ao
+#define LOCALS_TYPE BackgroundAOLocals
+#include "kernel/kernels/opencl/kernel_split_function.h"
+#undef KERNEL_NAME
+#undef LOCALS_TYPE
+
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_indirect_background.cl b/intern/cycles/kernel/kernels/opencl/kernel_indirect_background.cl
index 8c213ff5cb2..192d01444ba 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_indirect_background.cl
+++ b/intern/cycles/kernel/kernels/opencl/kernel_indirect_background.cl
@@ -18,9 +18,7 @@
#include "kernel/split/kernel_split_common.h"
#include "kernel/split/kernel_indirect_background.h"
-__kernel void kernel_ocl_path_trace_indirect_background(
- ccl_global char *kg,
- ccl_constant KernelData *data)
-{
- kernel_indirect_background((KernelGlobals*)kg);
-}
+#define KERNEL_NAME indirect_background
+#include "kernel/kernels/opencl/kernel_split_function.h"
+#undef KERNEL_NAME
+
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_indirect_subsurface.cl b/intern/cycles/kernel/kernels/opencl/kernel_indirect_subsurface.cl
index 998ebc4c0c3..84938b889e5 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_indirect_subsurface.cl
+++ b/intern/cycles/kernel/kernels/opencl/kernel_indirect_subsurface.cl
@@ -18,9 +18,7 @@
#include "kernel/split/kernel_split_common.h"
#include "kernel/split/kernel_indirect_subsurface.h"
-__kernel void kernel_ocl_path_trace_indirect_subsurface(
- ccl_global char *kg,
- ccl_constant KernelData *data)
-{
- kernel_indirect_subsurface((KernelGlobals*)kg);
-}
+#define KERNEL_NAME indirect_subsurface
+#include "kernel/kernels/opencl/kernel_split_function.h"
+#undef KERNEL_NAME
+
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_lamp_emission.cl b/intern/cycles/kernel/kernels/opencl/kernel_lamp_emission.cl
index 822d2287715..c314dc96c33 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_lamp_emission.cl
+++ b/intern/cycles/kernel/kernels/opencl/kernel_lamp_emission.cl
@@ -18,9 +18,7 @@
#include "kernel/split/kernel_split_common.h"
#include "kernel/split/kernel_lamp_emission.h"
-__kernel void kernel_ocl_path_trace_lamp_emission(
- ccl_global char *kg,
- ccl_constant KernelData *data)
-{
- kernel_lamp_emission((KernelGlobals*)kg);
-}
+#define KERNEL_NAME lamp_emission
+#include "kernel/kernels/opencl/kernel_split_function.h"
+#undef KERNEL_NAME
+
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_next_iteration_setup.cl b/intern/cycles/kernel/kernels/opencl/kernel_next_iteration_setup.cl
index 6d207253a40..8b1332bf013 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_next_iteration_setup.cl
+++ b/intern/cycles/kernel/kernels/opencl/kernel_next_iteration_setup.cl
@@ -18,10 +18,9 @@
#include "kernel/split/kernel_split_common.h"
#include "kernel/split/kernel_next_iteration_setup.h"
-__kernel void kernel_ocl_path_trace_next_iteration_setup(
- ccl_global char *kg,
- ccl_constant KernelData *data)
-{
- ccl_local unsigned int local_queue_atomics;
- kernel_next_iteration_setup((KernelGlobals*)kg, &local_queue_atomics);
-}
+#define KERNEL_NAME next_iteration_setup
+#define LOCALS_TYPE unsigned int
+#include "kernel/kernels/opencl/kernel_split_function.h"
+#undef KERNEL_NAME
+#undef LOCALS_TYPE
+
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_path_init.cl b/intern/cycles/kernel/kernels/opencl/kernel_path_init.cl
index bd9aa9538c8..fa210e747c0 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_path_init.cl
+++ b/intern/cycles/kernel/kernels/opencl/kernel_path_init.cl
@@ -18,9 +18,7 @@
#include "kernel/split/kernel_split_common.h"
#include "kernel/split/kernel_path_init.h"
-__kernel void kernel_ocl_path_trace_path_init(
- ccl_global char *kg,
- ccl_constant KernelData *data)
-{
- kernel_path_init((KernelGlobals*)kg);
-}
+#define KERNEL_NAME path_init
+#include "kernel/kernels/opencl/kernel_split_function.h"
+#undef KERNEL_NAME
+
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_queue_enqueue.cl b/intern/cycles/kernel/kernels/opencl/kernel_queue_enqueue.cl
index 9be154e3d75..68ee6f1d536 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_queue_enqueue.cl
+++ b/intern/cycles/kernel/kernels/opencl/kernel_queue_enqueue.cl
@@ -18,10 +18,9 @@
#include "kernel/split/kernel_split_common.h"
#include "kernel/split/kernel_queue_enqueue.h"
-__kernel void kernel_ocl_path_trace_queue_enqueue(
- ccl_global char *kg,
- ccl_constant KernelData *data)
-{
- ccl_local QueueEnqueueLocals locals;
- kernel_queue_enqueue((KernelGlobals*)kg, &locals);
-}
+#define KERNEL_NAME queue_enqueue
+#define LOCALS_TYPE QueueEnqueueLocals
+#include "kernel/kernels/opencl/kernel_split_function.h"
+#undef KERNEL_NAME
+#undef LOCALS_TYPE
+
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_scene_intersect.cl b/intern/cycles/kernel/kernels/opencl/kernel_scene_intersect.cl
index eb4fb4d153a..10d09377ba9 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_scene_intersect.cl
+++ b/intern/cycles/kernel/kernels/opencl/kernel_scene_intersect.cl
@@ -18,9 +18,7 @@
#include "kernel/split/kernel_split_common.h"
#include "kernel/split/kernel_scene_intersect.h"
-__kernel void kernel_ocl_path_trace_scene_intersect(
- ccl_global char *kg,
- ccl_constant KernelData *data)
-{
- kernel_scene_intersect((KernelGlobals*)kg);
-}
+#define KERNEL_NAME scene_intersect
+#include "kernel/kernels/opencl/kernel_split_function.h"
+#undef KERNEL_NAME
+
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_shader_eval.cl b/intern/cycles/kernel/kernels/opencl/kernel_shader_eval.cl
index 6baee460986..40eaa561863 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_shader_eval.cl
+++ b/intern/cycles/kernel/kernels/opencl/kernel_shader_eval.cl
@@ -18,10 +18,7 @@
#include "kernel/split/kernel_split_common.h"
#include "kernel/split/kernel_shader_eval.h"
-__kernel void kernel_ocl_path_trace_shader_eval(
- ccl_global char *kg,
- ccl_constant KernelData *data)
-{
- ccl_local unsigned int local_queue_atomics;
- kernel_shader_eval((KernelGlobals*)kg, &local_queue_atomics);
-}
+#define KERNEL_NAME shader_eval
+#include "kernel/kernels/opencl/kernel_split_function.h"
+#undef KERNEL_NAME
+
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_shader_setup.cl b/intern/cycles/kernel/kernels/opencl/kernel_shader_setup.cl
new file mode 100644
index 00000000000..8c36100f762
--- /dev/null
+++ b/intern/cycles/kernel/kernels/opencl/kernel_shader_setup.cl
@@ -0,0 +1,26 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernel/kernel_compat_opencl.h"
+#include "kernel/split/kernel_split_common.h"
+#include "kernel/split/kernel_shader_setup.h"
+
+#define KERNEL_NAME shader_setup
+#define LOCALS_TYPE unsigned int
+#include "kernel/kernels/opencl/kernel_split_function.h"
+#undef KERNEL_NAME
+#undef LOCALS_TYPE
+
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_shader_sort.cl b/intern/cycles/kernel/kernels/opencl/kernel_shader_sort.cl
new file mode 100644
index 00000000000..bcacaa4a054
--- /dev/null
+++ b/intern/cycles/kernel/kernels/opencl/kernel_shader_sort.cl
@@ -0,0 +1,27 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernel/kernel_compat_opencl.h"
+#include "kernel/split/kernel_split_common.h"
+#include "kernel/split/kernel_shader_sort.h"
+
+__attribute__((reqd_work_group_size(64, 1, 1)))
+#define KERNEL_NAME shader_sort
+#define LOCALS_TYPE ShaderSortLocals
+#include "kernel/kernels/opencl/kernel_split_function.h"
+#undef KERNEL_NAME
+#undef LOCALS_TYPE
+
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked_ao.cl b/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked_ao.cl
index 6a8ef81b32a..8de250a375c 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked_ao.cl
+++ b/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked_ao.cl
@@ -18,9 +18,7 @@
#include "kernel/split/kernel_split_common.h"
#include "kernel/split/kernel_shadow_blocked_ao.h"
-__kernel void kernel_ocl_path_trace_shadow_blocked_ao(
- ccl_global char *kg,
- ccl_constant KernelData *data)
-{
- kernel_shadow_blocked_ao((KernelGlobals*)kg);
-}
+#define KERNEL_NAME shadow_blocked_ao
+#include "kernel/kernels/opencl/kernel_split_function.h"
+#undef KERNEL_NAME
+
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked_dl.cl b/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked_dl.cl
index b255cc5ef8b..29da77022ed 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked_dl.cl
+++ b/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked_dl.cl
@@ -18,9 +18,7 @@
#include "kernel/split/kernel_split_common.h"
#include "kernel/split/kernel_shadow_blocked_dl.h"
-__kernel void kernel_ocl_path_trace_shadow_blocked_dl(
- ccl_global char *kg,
- ccl_constant KernelData *data)
-{
- kernel_shadow_blocked_dl((KernelGlobals*)kg);
-}
+#define KERNEL_NAME shadow_blocked_dl
+#include "kernel/kernels/opencl/kernel_split_function.h"
+#undef KERNEL_NAME
+
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_split.cl b/intern/cycles/kernel/kernels/opencl/kernel_split.cl
index 732cda30115..651addb02f4 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_split.cl
+++ b/intern/cycles/kernel/kernels/opencl/kernel_split.cl
@@ -23,12 +23,15 @@
#include "kernel/kernels/opencl/kernel_do_volume.cl"
#include "kernel/kernels/opencl/kernel_indirect_background.cl"
#include "kernel/kernels/opencl/kernel_queue_enqueue.cl"
+#include "kernel/kernels/opencl/kernel_shader_setup.cl"
+#include "kernel/kernels/opencl/kernel_shader_sort.cl"
#include "kernel/kernels/opencl/kernel_shader_eval.cl"
#include "kernel/kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl"
#include "kernel/kernels/opencl/kernel_subsurface_scatter.cl"
#include "kernel/kernels/opencl/kernel_direct_lighting.cl"
#include "kernel/kernels/opencl/kernel_shadow_blocked_ao.cl"
#include "kernel/kernels/opencl/kernel_shadow_blocked_dl.cl"
+#include "kernel/kernels/opencl/kernel_enqueue_inactive.cl"
#include "kernel/kernels/opencl/kernel_next_iteration_setup.cl"
#include "kernel/kernels/opencl/kernel_indirect_subsurface.cl"
#include "kernel/kernels/opencl/kernel_buffer_update.cl"
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_split_function.h b/intern/cycles/kernel/kernels/opencl/kernel_split_function.h
new file mode 100644
index 00000000000..f1e914a70d4
--- /dev/null
+++ b/intern/cycles/kernel/kernels/opencl/kernel_split_function.h
@@ -0,0 +1,72 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#define KERNEL_NAME_JOIN(a, b) a ## _ ## b
+#define KERNEL_NAME_EVAL(a, b) KERNEL_NAME_JOIN(a, b)
+
+__kernel void KERNEL_NAME_EVAL(kernel_ocl_path_trace, KERNEL_NAME)(
+ ccl_global char *kg_global,
+ ccl_constant KernelData *data,
+
+ ccl_global void *split_data_buffer,
+ ccl_global char *ray_state,
+ ccl_global uint *rng_state,
+
+#define KERNEL_TEX(type, ttype, name) \
+ ccl_global type *name,
+#include "kernel/kernel_textures.h"
+
+ ccl_global int *queue_index,
+ ccl_global char *use_queues_flag,
+ ccl_global unsigned int *work_pools,
+ ccl_global float *buffer
+ )
+{
+#ifdef LOCALS_TYPE
+ ccl_local LOCALS_TYPE locals;
+#endif
+
+ KernelGlobals *kg = (KernelGlobals*)kg_global;
+
+ if(ccl_local_id(0) + ccl_local_id(1) == 0) {
+ kg->data = data;
+
+ kernel_split_params.rng_state = rng_state;
+ kernel_split_params.queue_index = queue_index;
+ kernel_split_params.use_queues_flag = use_queues_flag;
+ kernel_split_params.work_pools = work_pools;
+ kernel_split_params.buffer = buffer;
+
+ split_data_init(kg, &kernel_split_state, ccl_global_size(0)*ccl_global_size(1), split_data_buffer, ray_state);
+
+#define KERNEL_TEX(type, ttype, name) \
+ kg->name = name;
+#include "kernel/kernel_textures.h"
+ }
+
+ ccl_barrier(CCL_LOCAL_MEM_FENCE);
+
+ KERNEL_NAME_EVAL(kernel, KERNEL_NAME)(
+ kg
+#ifdef LOCALS_TYPE
+ , &locals
+#endif
+ );
+}
+
+#undef KERNEL_NAME_JOIN
+#undef KERNEL_NAME_EVAL
+
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_subsurface_scatter.cl b/intern/cycles/kernel/kernels/opencl/kernel_subsurface_scatter.cl
index 7a1838e485f..2b3be38df84 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_subsurface_scatter.cl
+++ b/intern/cycles/kernel/kernels/opencl/kernel_subsurface_scatter.cl
@@ -18,10 +18,7 @@
#include "kernel/split/kernel_split_common.h"
#include "kernel/split/kernel_subsurface_scatter.h"
-__kernel void kernel_ocl_path_trace_subsurface_scatter(
- ccl_global char *kg,
- ccl_constant KernelData *data)
-{
- ccl_local unsigned int local_queue_atomics;
- kernel_subsurface_scatter((KernelGlobals*)kg, &local_queue_atomics);
-}
+#define KERNEL_NAME subsurface_scatter
+#include "kernel/kernels/opencl/kernel_split_function.h"
+#undef KERNEL_NAME
+
diff --git a/intern/cycles/kernel/osl/osl_bssrdf.cpp b/intern/cycles/kernel/osl/osl_bssrdf.cpp
index 95beea01d25..27a96720c1e 100644
--- a/intern/cycles/kernel/osl/osl_bssrdf.cpp
+++ b/intern/cycles/kernel/osl/osl_bssrdf.cpp
@@ -39,7 +39,9 @@
#include "kernel/kernel_montecarlo.h"
#include "kernel/closure/alloc.h"
+#include "kernel/closure/bsdf_util.h"
#include "kernel/closure/bsdf_diffuse.h"
+#include "kernel/closure/bsdf_principled_diffuse.h"
#include "kernel/closure/bssrdf.h"
CCL_NAMESPACE_BEGIN
@@ -78,6 +80,7 @@ public:
bssrdf->albedo = albedo.x;
bssrdf->sharpness = sharpness;
bssrdf->N = params.N;
+ bssrdf->roughness = params.roughness;
sd->flag |= bssrdf_setup(bssrdf, (ClosureType)type);
}
@@ -89,6 +92,7 @@ public:
bssrdf->albedo = albedo.y;
bssrdf->sharpness = sharpness;
bssrdf->N = params.N;
+ bssrdf->roughness = params.roughness;
sd->flag |= bssrdf_setup(bssrdf, (ClosureType)type);
}
@@ -100,6 +104,7 @@ public:
bssrdf->albedo = albedo.z;
bssrdf->sharpness = sharpness;
bssrdf->N = params.N;
+ bssrdf->roughness = params.roughness;
sd->flag |= bssrdf_setup(bssrdf, (ClosureType)type);
}
}
@@ -180,5 +185,31 @@ ClosureParam *closure_bssrdf_burley_params()
CCLOSURE_PREPARE(closure_bssrdf_burley_prepare, BurleyBSSRDFClosure)
+/* Disney principled */
+
+class PrincipledBSSRDFClosure : public CBSSRDFClosure {
+public:
+ void setup(ShaderData *sd, int path_flag, float3 weight)
+ {
+ alloc(sd, path_flag, weight, CLOSURE_BSSRDF_PRINCIPLED_ID);
+ }
+};
+
+ClosureParam *closure_bssrdf_principled_params()
+{
+ static ClosureParam params[] = {
+ CLOSURE_FLOAT3_PARAM(PrincipledBSSRDFClosure, params.N),
+ CLOSURE_FLOAT3_PARAM(PrincipledBSSRDFClosure, radius),
+ CLOSURE_FLOAT_PARAM(PrincipledBSSRDFClosure, params.texture_blur),
+ CLOSURE_FLOAT3_PARAM(PrincipledBSSRDFClosure, albedo),
+ CLOSURE_FLOAT_PARAM(PrincipledBSSRDFClosure, params.roughness),
+ CLOSURE_STRING_KEYPARAM(PrincipledBSSRDFClosure, label, "label"),
+ CLOSURE_FINISH_PARAM(PrincipledBSSRDFClosure)
+ };
+ return params;
+}
+
+CCLOSURE_PREPARE(closure_bssrdf_principled_prepare, PrincipledBSSRDFClosure)
+
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/osl/osl_closures.cpp b/intern/cycles/kernel/osl/osl_closures.cpp
index f44714c2150..14c5c1c3db5 100644
--- a/intern/cycles/kernel/osl/osl_closures.cpp
+++ b/intern/cycles/kernel/osl/osl_closures.cpp
@@ -60,6 +60,8 @@
#include "kernel/closure/bsdf_ashikhmin_shirley.h"
#include "kernel/closure/bsdf_toon.h"
#include "kernel/closure/bsdf_hair.h"
+#include "kernel/closure/bsdf_principled_diffuse.h"
+#include "kernel/closure/bsdf_principled_sheen.h"
#include "kernel/closure/volume.h"
CCL_NAMESPACE_BEGIN
@@ -154,7 +156,7 @@ BSDF_CLOSURE_CLASS_BEGIN(MicrofacetBeckmannRefraction, microfacet_beckmann_refra
BSDF_CLOSURE_CLASS_END(MicrofacetBeckmannRefraction, microfacet_beckmann_refraction)
BSDF_CLOSURE_CLASS_BEGIN(HairReflection, hair_reflection, HairBsdf, LABEL_GLOSSY)
- CLOSURE_FLOAT3_PARAM(HairReflectionClosure, unused),
+ CLOSURE_FLOAT3_PARAM(HairReflectionClosure, params.N),
CLOSURE_FLOAT_PARAM(HairReflectionClosure, params.roughness1),
CLOSURE_FLOAT_PARAM(HairReflectionClosure, params.roughness2),
CLOSURE_FLOAT3_PARAM(HairReflectionClosure, params.T),
@@ -162,7 +164,7 @@ BSDF_CLOSURE_CLASS_BEGIN(HairReflection, hair_reflection, HairBsdf, LABEL_GLOSSY
BSDF_CLOSURE_CLASS_END(HairReflection, hair_reflection)
BSDF_CLOSURE_CLASS_BEGIN(HairTransmission, hair_transmission, HairBsdf, LABEL_GLOSSY)
- CLOSURE_FLOAT3_PARAM(HairTransmissionClosure, unused),
+ CLOSURE_FLOAT3_PARAM(HairTransmissionClosure, params.N),
CLOSURE_FLOAT_PARAM(HairTransmissionClosure, params.roughness1),
CLOSURE_FLOAT_PARAM(HairTransmissionClosure, params.roughness2),
CLOSURE_FLOAT3_PARAM(HairReflectionClosure, params.T),
@@ -176,6 +178,63 @@ VOLUME_CLOSURE_CLASS_END(VolumeHenyeyGreenstein, henyey_greenstein)
VOLUME_CLOSURE_CLASS_BEGIN(VolumeAbsorption, absorption, ShaderClosure, LABEL_SINGULAR)
VOLUME_CLOSURE_CLASS_END(VolumeAbsorption, absorption)
+BSDF_CLOSURE_CLASS_BEGIN(PrincipledDiffuse, principled_diffuse, PrincipledDiffuseBsdf, LABEL_DIFFUSE)
+ CLOSURE_FLOAT3_PARAM(PrincipledDiffuseClosure, params.N),
+ CLOSURE_FLOAT_PARAM(PrincipledDiffuseClosure, params.roughness),
+BSDF_CLOSURE_CLASS_END(PrincipledDiffuse, principled_diffuse)
+
+BSDF_CLOSURE_CLASS_BEGIN(PrincipledSheen, principled_sheen, PrincipledSheenBsdf, LABEL_DIFFUSE)
+ CLOSURE_FLOAT3_PARAM(PrincipledSheenClosure, params.N),
+BSDF_CLOSURE_CLASS_END(PrincipledSheen, principled_sheen)
+
+/* DISNEY PRINCIPLED CLEARCOAT */
+class PrincipledClearcoatClosure : public CBSDFClosure {
+public:
+ MicrofacetBsdf params;
+ float clearcoat, clearcoat_roughness;
+
+ MicrofacetBsdf *alloc(ShaderData *sd, int path_flag, float3 weight)
+ {
+ MicrofacetBsdf *bsdf = (MicrofacetBsdf*)bsdf_alloc_osl(sd, sizeof(MicrofacetBsdf), weight, &params);
+ MicrofacetExtra *extra = (MicrofacetExtra*)closure_alloc_extra(sd, sizeof(MicrofacetExtra));
+ if(bsdf && extra) {
+ bsdf->extra = extra;
+
+ bsdf->ior = 1.5f;
+
+ bsdf->alpha_x = clearcoat_roughness;
+ bsdf->alpha_y = clearcoat_roughness;
+
+ bsdf->extra->cspec0 = make_float3(0.04f, 0.04f, 0.04f);
+ bsdf->extra->clearcoat = clearcoat;
+
+ return bsdf;
+ }
+
+ return NULL;
+ }
+
+ void setup(ShaderData *sd, int path_flag, float3 weight)
+ {
+ MicrofacetBsdf *bsdf = alloc(sd, path_flag, weight);
+ sd->flag |= (bsdf) ? bsdf_microfacet_ggx_clearcoat_setup(bsdf, sd) : 0;
+ }
+};
+
+ClosureParam *closure_bsdf_principled_clearcoat_params()
+{
+ static ClosureParam params[] = {
+ CLOSURE_FLOAT3_PARAM(PrincipledClearcoatClosure, params.N),
+ CLOSURE_FLOAT_PARAM(PrincipledClearcoatClosure, clearcoat),
+ CLOSURE_FLOAT_PARAM(PrincipledClearcoatClosure, clearcoat_roughness),
+ CLOSURE_STRING_KEYPARAM(PrincipledClearcoatClosure, label, "label"),
+ CLOSURE_FINISH_PARAM(PrincipledClearcoatClosure)
+ };
+ return params;
+}
+CCLOSURE_PREPARE(closure_bsdf_principled_clearcoat_prepare, PrincipledClearcoatClosure)
+
+
/* Registration */
static void register_closure(OSL::ShadingSystem *ss, const char *name, int id, OSL::ClosureParam *params, OSL::PrepareClosureFunc prepare)
@@ -215,6 +274,16 @@ void OSLShader::register_closures(OSLShadingSystem *ss_)
closure_bsdf_microfacet_multi_ggx_glass_params(), closure_bsdf_microfacet_multi_ggx_glass_prepare);
register_closure(ss, "microfacet_multi_ggx_aniso", id++,
closure_bsdf_microfacet_multi_ggx_aniso_params(), closure_bsdf_microfacet_multi_ggx_aniso_prepare);
+ register_closure(ss, "microfacet_ggx_fresnel", id++,
+ closure_bsdf_microfacet_ggx_fresnel_params(), closure_bsdf_microfacet_ggx_fresnel_prepare);
+ register_closure(ss, "microfacet_ggx_aniso_fresnel", id++,
+ closure_bsdf_microfacet_ggx_aniso_fresnel_params(), closure_bsdf_microfacet_ggx_aniso_fresnel_prepare);
+ register_closure(ss, "microfacet_multi_ggx_fresnel", id++,
+ closure_bsdf_microfacet_multi_ggx_fresnel_params(), closure_bsdf_microfacet_multi_ggx_fresnel_prepare);
+ register_closure(ss, "microfacet_multi_ggx_glass_fresnel", id++,
+ closure_bsdf_microfacet_multi_ggx_glass_fresnel_params(), closure_bsdf_microfacet_multi_ggx_glass_fresnel_prepare);
+ register_closure(ss, "microfacet_multi_ggx_aniso_fresnel", id++,
+ closure_bsdf_microfacet_multi_ggx_aniso_fresnel_params(), closure_bsdf_microfacet_multi_ggx_aniso_fresnel_prepare);
register_closure(ss, "microfacet_beckmann", id++,
bsdf_microfacet_beckmann_params(), bsdf_microfacet_beckmann_prepare);
register_closure(ss, "microfacet_beckmann_aniso", id++,
@@ -229,6 +298,12 @@ void OSLShader::register_closures(OSLShadingSystem *ss_)
bsdf_diffuse_toon_params(), bsdf_diffuse_toon_prepare);
register_closure(ss, "glossy_toon", id++,
bsdf_glossy_toon_params(), bsdf_glossy_toon_prepare);
+ register_closure(ss, "principled_diffuse", id++,
+ bsdf_principled_diffuse_params(), bsdf_principled_diffuse_prepare);
+ register_closure(ss, "principled_sheen", id++,
+ bsdf_principled_sheen_params(), bsdf_principled_sheen_prepare);
+ register_closure(ss, "principled_clearcoat", id++,
+ closure_bsdf_principled_clearcoat_params(), closure_bsdf_principled_clearcoat_prepare);
register_closure(ss, "emission", id++,
closure_emission_params(), closure_emission_prepare);
@@ -248,6 +323,8 @@ void OSLShader::register_closures(OSLShadingSystem *ss_)
closure_bssrdf_gaussian_params(), closure_bssrdf_gaussian_prepare);
register_closure(ss, "bssrdf_burley", id++,
closure_bssrdf_burley_params(), closure_bssrdf_burley_prepare);
+ register_closure(ss, "bssrdf_principled", id++,
+ closure_bssrdf_principled_params(), closure_bssrdf_principled_prepare);
register_closure(ss, "hair_reflection", id++,
bsdf_hair_reflection_params(), bsdf_hair_reflection_prepare);
@@ -278,6 +355,86 @@ bool CBSDFClosure::skip(const ShaderData *sd, int path_flag, int scattering)
return false;
}
+
+/* GGX closures with Fresnel */
+
+class MicrofacetFresnelClosure : public CBSDFClosure {
+public:
+ MicrofacetBsdf params;
+ float3 color;
+ float3 cspec0;
+
+ MicrofacetBsdf *alloc(ShaderData *sd, int path_flag, float3 weight)
+ {
+ /* Technically, the MultiGGX Glass closure may also transmit. However,
+ * since this is set statically and only used for caustic flags, this
+ * is probably as good as it gets. */
+ if(!skip(sd, path_flag, LABEL_GLOSSY | LABEL_REFLECT)) {
+ MicrofacetBsdf *bsdf = (MicrofacetBsdf*)bsdf_alloc_osl(sd, sizeof(MicrofacetBsdf), weight, &params);
+ MicrofacetExtra *extra = (MicrofacetExtra*)closure_alloc_extra(sd, sizeof(MicrofacetExtra));
+ if(bsdf && extra) {
+ bsdf->extra = extra;
+ bsdf->extra->color = color;
+ bsdf->extra->cspec0 = cspec0;
+ return bsdf;
+ }
+ }
+
+ return NULL;
+ }
+};
+
+class MicrofacetGGXFresnelClosure : public MicrofacetFresnelClosure {
+public:
+ void setup(ShaderData *sd, int path_flag, float3 weight)
+ {
+ MicrofacetBsdf *bsdf = alloc(sd, path_flag, weight);
+ sd->flag |= (bsdf) ? bsdf_microfacet_ggx_fresnel_setup(bsdf, sd) : 0;
+ }
+};
+
+ClosureParam *closure_bsdf_microfacet_ggx_fresnel_params()
+{
+ static ClosureParam params[] = {
+ CLOSURE_FLOAT3_PARAM(MicrofacetGGXFresnelClosure, params.N),
+ CLOSURE_FLOAT_PARAM(MicrofacetGGXFresnelClosure, params.alpha_x),
+ CLOSURE_FLOAT_PARAM(MicrofacetGGXFresnelClosure, params.ior),
+ CLOSURE_FLOAT3_PARAM(MicrofacetGGXFresnelClosure, color),
+ CLOSURE_FLOAT3_PARAM(MicrofacetGGXFresnelClosure, cspec0),
+ CLOSURE_STRING_KEYPARAM(MicrofacetGGXFresnelClosure, label, "label"),
+ CLOSURE_FINISH_PARAM(MicrofacetGGXFresnelClosure)
+ };
+ return params;
+}
+CCLOSURE_PREPARE(closure_bsdf_microfacet_ggx_fresnel_prepare, MicrofacetGGXFresnelClosure);
+
+class MicrofacetGGXAnisoFresnelClosure : public MicrofacetFresnelClosure {
+public:
+ void setup(ShaderData *sd, int path_flag, float3 weight)
+ {
+ MicrofacetBsdf *bsdf = alloc(sd, path_flag, weight);
+ sd->flag |= (bsdf) ? bsdf_microfacet_ggx_aniso_fresnel_setup(bsdf, sd) : 0;
+ }
+};
+
+ClosureParam *closure_bsdf_microfacet_ggx_aniso_fresnel_params()
+{
+ static ClosureParam params[] = {
+ CLOSURE_FLOAT3_PARAM(MicrofacetGGXFresnelClosure, params.N),
+ CLOSURE_FLOAT3_PARAM(MicrofacetGGXFresnelClosure, params.T),
+ CLOSURE_FLOAT_PARAM(MicrofacetGGXFresnelClosure, params.alpha_x),
+ CLOSURE_FLOAT_PARAM(MicrofacetGGXFresnelClosure, params.alpha_y),
+ CLOSURE_FLOAT_PARAM(MicrofacetGGXFresnelClosure, params.ior),
+ CLOSURE_FLOAT3_PARAM(MicrofacetGGXFresnelClosure, color),
+ CLOSURE_FLOAT3_PARAM(MicrofacetGGXFresnelClosure, cspec0),
+ CLOSURE_STRING_KEYPARAM(MicrofacetGGXFresnelClosure, label, "label"),
+ CLOSURE_FINISH_PARAM(MicrofacetGGXFresnelClosure)
+ };
+ return params;
+}
+CCLOSURE_PREPARE(closure_bsdf_microfacet_ggx_aniso_fresnel_prepare, MicrofacetGGXAnisoFresnelClosure);
+
+
/* Multiscattering GGX closures */
class MicrofacetMultiClosure : public CBSDFClosure {
@@ -287,7 +444,7 @@ public:
MicrofacetBsdf *alloc(ShaderData *sd, int path_flag, float3 weight)
{
- /* Technically, the MultiGGX Glass closure may also transmit. However,
+ /* Technically, the MultiGGX closure may also transmit. However,
* since this is set statically and only used for caustic flags, this
* is probably as good as it gets. */
if(!skip(sd, path_flag, LABEL_GLOSSY|LABEL_REFLECT)) {
@@ -375,5 +532,110 @@ ClosureParam *closure_bsdf_microfacet_multi_ggx_glass_params()
}
CCLOSURE_PREPARE(closure_bsdf_microfacet_multi_ggx_glass_prepare, MicrofacetMultiGGXGlassClosure);
+
+/* Multiscattering GGX closures with Fresnel */
+
+class MicrofacetMultiFresnelClosure : public CBSDFClosure {
+public:
+ MicrofacetBsdf params;
+ float3 color;
+ float3 cspec0;
+
+ MicrofacetBsdf *alloc(ShaderData *sd, int path_flag, float3 weight)
+ {
+ /* Technically, the MultiGGX closure may also transmit. However,
+ * since this is set statically and only used for caustic flags, this
+ * is probably as good as it gets. */
+ if(!skip(sd, path_flag, LABEL_GLOSSY | LABEL_REFLECT)) {
+ MicrofacetBsdf *bsdf = (MicrofacetBsdf*)bsdf_alloc_osl(sd, sizeof(MicrofacetBsdf), weight, &params);
+ MicrofacetExtra *extra = (MicrofacetExtra*)closure_alloc_extra(sd, sizeof(MicrofacetExtra));
+ if(bsdf && extra) {
+ bsdf->extra = extra;
+ bsdf->extra->color = color;
+ bsdf->extra->cspec0 = cspec0;
+ return bsdf;
+ }
+ }
+
+ return NULL;
+ }
+};
+
+class MicrofacetMultiGGXFresnelClosure : public MicrofacetMultiFresnelClosure {
+public:
+ void setup(ShaderData *sd, int path_flag, float3 weight)
+ {
+ MicrofacetBsdf *bsdf = alloc(sd, path_flag, weight);
+ sd->flag |= (bsdf) ? bsdf_microfacet_multi_ggx_fresnel_setup(bsdf, sd) : 0;
+ }
+};
+
+ClosureParam *closure_bsdf_microfacet_multi_ggx_fresnel_params()
+{
+ static ClosureParam params[] = {
+ CLOSURE_FLOAT3_PARAM(MicrofacetMultiGGXFresnelClosure, params.N),
+ CLOSURE_FLOAT_PARAM(MicrofacetMultiGGXFresnelClosure, params.alpha_x),
+ CLOSURE_FLOAT_PARAM(MicrofacetMultiGGXFresnelClosure, params.ior),
+ CLOSURE_FLOAT3_PARAM(MicrofacetMultiGGXFresnelClosure, color),
+ CLOSURE_FLOAT3_PARAM(MicrofacetMultiGGXFresnelClosure, cspec0),
+ CLOSURE_STRING_KEYPARAM(MicrofacetMultiGGXFresnelClosure, label, "label"),
+ CLOSURE_FINISH_PARAM(MicrofacetMultiGGXFresnelClosure)
+ };
+ return params;
+}
+CCLOSURE_PREPARE(closure_bsdf_microfacet_multi_ggx_fresnel_prepare, MicrofacetMultiGGXFresnelClosure);
+
+class MicrofacetMultiGGXAnisoFresnelClosure : public MicrofacetMultiFresnelClosure {
+public:
+ void setup(ShaderData *sd, int path_flag, float3 weight)
+ {
+ MicrofacetBsdf *bsdf = alloc(sd, path_flag, weight);
+ sd->flag |= (bsdf) ? bsdf_microfacet_multi_ggx_aniso_fresnel_setup(bsdf, sd) : 0;
+ }
+};
+
+ClosureParam *closure_bsdf_microfacet_multi_ggx_aniso_fresnel_params()
+{
+ static ClosureParam params[] = {
+ CLOSURE_FLOAT3_PARAM(MicrofacetMultiGGXFresnelClosure, params.N),
+ CLOSURE_FLOAT3_PARAM(MicrofacetMultiGGXFresnelClosure, params.T),
+ CLOSURE_FLOAT_PARAM(MicrofacetMultiGGXFresnelClosure, params.alpha_x),
+ CLOSURE_FLOAT_PARAM(MicrofacetMultiGGXFresnelClosure, params.alpha_y),
+ CLOSURE_FLOAT_PARAM(MicrofacetMultiGGXFresnelClosure, params.ior),
+ CLOSURE_FLOAT3_PARAM(MicrofacetMultiGGXFresnelClosure, color),
+ CLOSURE_FLOAT3_PARAM(MicrofacetMultiGGXFresnelClosure, cspec0),
+ CLOSURE_STRING_KEYPARAM(MicrofacetMultiGGXFresnelClosure, label, "label"),
+ CLOSURE_FINISH_PARAM(MicrofacetMultiGGXFresnelClosure)
+ };
+ return params;
+}
+CCLOSURE_PREPARE(closure_bsdf_microfacet_multi_ggx_aniso_fresnel_prepare, MicrofacetMultiGGXAnisoFresnelClosure);
+
+class MicrofacetMultiGGXGlassFresnelClosure : public MicrofacetMultiFresnelClosure {
+public:
+ MicrofacetMultiGGXGlassFresnelClosure() : MicrofacetMultiFresnelClosure() {}
+
+ void setup(ShaderData *sd, int path_flag, float3 weight)
+ {
+ MicrofacetBsdf *bsdf = alloc(sd, path_flag, weight);
+ sd->flag |= (bsdf) ? bsdf_microfacet_multi_ggx_glass_fresnel_setup(bsdf, sd) : 0;
+ }
+};
+
+ClosureParam *closure_bsdf_microfacet_multi_ggx_glass_fresnel_params()
+{
+ static ClosureParam params[] = {
+ CLOSURE_FLOAT3_PARAM(MicrofacetMultiGGXFresnelClosure, params.N),
+ CLOSURE_FLOAT_PARAM(MicrofacetMultiGGXFresnelClosure, params.alpha_x),
+ CLOSURE_FLOAT_PARAM(MicrofacetMultiGGXFresnelClosure, params.ior),
+ CLOSURE_FLOAT3_PARAM(MicrofacetMultiGGXFresnelClosure, color),
+ CLOSURE_FLOAT3_PARAM(MicrofacetMultiGGXFresnelClosure, cspec0),
+ CLOSURE_STRING_KEYPARAM(MicrofacetMultiGGXFresnelClosure, label, "label"),
+ CLOSURE_FINISH_PARAM(MicrofacetMultiGGXFresnelClosure)
+ };
+ return params;
+}
+CCLOSURE_PREPARE(closure_bsdf_microfacet_multi_ggx_glass_fresnel_prepare, MicrofacetMultiGGXGlassFresnelClosure);
+
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/osl/osl_closures.h b/intern/cycles/kernel/osl/osl_closures.h
index 929cf00a7e6..ff5fd9cc905 100644
--- a/intern/cycles/kernel/osl/osl_closures.h
+++ b/intern/cycles/kernel/osl/osl_closures.h
@@ -51,10 +51,17 @@ OSL::ClosureParam *closure_bsdf_phong_ramp_params();
OSL::ClosureParam *closure_bssrdf_cubic_params();
OSL::ClosureParam *closure_bssrdf_gaussian_params();
OSL::ClosureParam *closure_bssrdf_burley_params();
+OSL::ClosureParam *closure_bssrdf_principled_params();
OSL::ClosureParam *closure_henyey_greenstein_volume_params();
OSL::ClosureParam *closure_bsdf_microfacet_multi_ggx_params();
OSL::ClosureParam *closure_bsdf_microfacet_multi_ggx_glass_params();
OSL::ClosureParam *closure_bsdf_microfacet_multi_ggx_aniso_params();
+OSL::ClosureParam *closure_bsdf_microfacet_ggx_fresnel_params();
+OSL::ClosureParam *closure_bsdf_microfacet_ggx_aniso_fresnel_params();
+OSL::ClosureParam *closure_bsdf_microfacet_multi_ggx_fresnel_params();
+OSL::ClosureParam *closure_bsdf_microfacet_multi_ggx_glass_fresnel_params();
+OSL::ClosureParam *closure_bsdf_microfacet_multi_ggx_aniso_fresnel_params();
+OSL::ClosureParam *closure_bsdf_principled_clearcoat_params();
void closure_emission_prepare(OSL::RendererServices *, int id, void *data);
void closure_background_prepare(OSL::RendererServices *, int id, void *data);
@@ -65,10 +72,17 @@ void closure_bsdf_phong_ramp_prepare(OSL::RendererServices *, int id, void *data
void closure_bssrdf_cubic_prepare(OSL::RendererServices *, int id, void *data);
void closure_bssrdf_gaussian_prepare(OSL::RendererServices *, int id, void *data);
void closure_bssrdf_burley_prepare(OSL::RendererServices *, int id, void *data);
+void closure_bssrdf_principled_prepare(OSL::RendererServices *, int id, void *data);
void closure_henyey_greenstein_volume_prepare(OSL::RendererServices *, int id, void *data);
void closure_bsdf_microfacet_multi_ggx_prepare(OSL::RendererServices *, int id, void *data);
void closure_bsdf_microfacet_multi_ggx_glass_prepare(OSL::RendererServices *, int id, void *data);
void closure_bsdf_microfacet_multi_ggx_aniso_prepare(OSL::RendererServices *, int id, void *data);
+void closure_bsdf_microfacet_ggx_fresnel_prepare(OSL::RendererServices *, int id, void *data);
+void closure_bsdf_microfacet_ggx_aniso_fresnel_prepare(OSL::RendererServices *, int id, void *data);
+void closure_bsdf_microfacet_multi_ggx_fresnel_prepare(OSL::RendererServices *, int id, void *data);
+void closure_bsdf_microfacet_multi_ggx_glass_fresnel_prepare(OSL::RendererServices *, int id, void *data);
+void closure_bsdf_microfacet_multi_ggx_aniso_fresnel_prepare(OSL::RendererServices *, int id, void *data);
+void closure_bsdf_principled_clearcoat_prepare(OSL::RendererServices *, int id, void *data);
#define CCLOSURE_PREPARE(name, classname) \
void name(RendererServices *, int id, void *data) \
diff --git a/intern/cycles/kernel/osl/osl_services.cpp b/intern/cycles/kernel/osl/osl_services.cpp
index b767c60c617..1535496c73d 100644
--- a/intern/cycles/kernel/osl/osl_services.cpp
+++ b/intern/cycles/kernel/osl/osl_services.cpp
@@ -824,7 +824,7 @@ bool OSLRenderServices::get_background_attribute(KernelGlobals *kg, ShaderData *
bool OSLRenderServices::get_attribute(OSL::ShaderGlobals *sg, bool derivatives, ustring object_name,
TypeDesc type, ustring name, void *val)
{
- if(sg->renderstate == NULL)
+ if(sg == NULL || sg->renderstate == NULL)
return false;
ShaderData *sd = (ShaderData *)(sg->renderstate);
diff --git a/intern/cycles/kernel/shaders/CMakeLists.txt b/intern/cycles/kernel/shaders/CMakeLists.txt
index b43f8402d42..1a8ed4c884a 100644
--- a/intern/cycles/kernel/shaders/CMakeLists.txt
+++ b/intern/cycles/kernel/shaders/CMakeLists.txt
@@ -81,13 +81,15 @@ set(SRC_OSL
node_wireframe.osl
node_hair_bsdf.osl
node_uv_map.osl
+ node_principled_bsdf.osl
node_rgb_to_bw.osl
)
set(SRC_OSL_HEADERS
- node_texture.h
node_color.h
node_fresnel.h
+ node_ramp_util.h
+ node_texture.h
stdosl.h
oslutil.h
)
diff --git a/intern/cycles/kernel/shaders/node_principled_bsdf.osl b/intern/cycles/kernel/shaders/node_principled_bsdf.osl
new file mode 100644
index 00000000000..6870d479af3
--- /dev/null
+++ b/intern/cycles/kernel/shaders/node_principled_bsdf.osl
@@ -0,0 +1,120 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "stdosl.h"
+#include "node_fresnel.h"
+
+shader node_principled_bsdf(
+ string distribution = "Multiscatter GGX",
+ color BaseColor = color(0.8, 0.8, 0.8),
+ float Subsurface = 0.0,
+ vector SubsurfaceRadius = vector(1.0, 1.0, 1.0),
+ color SubsurfaceColor = color(0.7, 0.1, 0.1),
+ float Metallic = 0.0,
+ float Specular = 0.5,
+ float SpecularTint = 0.0,
+ float Roughness = 0.5,
+ float Anisotropic = 0.0,
+ float AnisotropicRotation = 0.0,
+ float Sheen = 0.0,
+ float SheenTint = 0.5,
+ float Clearcoat = 0.0,
+ float ClearcoatRoughness = 0.03,
+ float IOR = 1.45,
+ float Transmission = 0.0,
+ float TransmissionRoughness = 0.0,
+ normal Normal = N,
+ normal ClearcoatNormal = N,
+ normal Tangent = normalize(dPdu),
+ output closure color BSDF = 0)
+{
+ float f = max(IOR, 1e-5);
+ float diffuse_weight = (1.0 - clamp(Metallic, 0.0, 1.0)) * (1.0 - clamp(Transmission, 0.0, 1.0));
+ float final_transmission = clamp(Transmission, 0.0, 1.0) * (1.0 - clamp(Metallic, 0.0, 1.0));
+ float specular_weight = (1.0 - final_transmission);
+
+ vector T = Tangent;
+
+ float m_cdlum = luminance(BaseColor);
+ color m_ctint = m_cdlum > 0.0 ? BaseColor / m_cdlum : color(0.0, 0.0, 0.0); // normalize lum. to isolate hue+sat
+
+ /* rotate tangent */
+ if (AnisotropicRotation != 0.0)
+ T = rotate(T, AnisotropicRotation * M_2PI, point(0.0, 0.0, 0.0), Normal);
+
+ if (diffuse_weight > 1e-5) {
+ if (Subsurface > 1e-5) {
+ color mixed_ss_base_color = SubsurfaceColor * Subsurface + BaseColor * (1.0 - Subsurface);
+ BSDF = mixed_ss_base_color * bssrdf_principled(Normal, Subsurface * SubsurfaceRadius, 0.0, SubsurfaceColor, Roughness);
+ } else {
+ BSDF = BaseColor * principled_diffuse(Normal, Roughness);
+ }
+
+ if (Sheen > 1e-5) {
+ color sheen_color = color(1.0, 1.0, 1.0) * (1.0 - SheenTint) + m_ctint * SheenTint;
+
+ BSDF = BSDF + sheen_color * Sheen * principled_sheen(Normal);
+ }
+
+ BSDF = BSDF * diffuse_weight;
+ }
+
+ if (specular_weight > 1e-5) {
+ float aspect = sqrt(1.0 - Anisotropic * 0.9);
+ float r2 = Roughness * Roughness;
+
+ float alpha_x = r2 / aspect;
+ float alpha_y = r2 * aspect;
+
+ color tmp_col = color(1.0, 1.0, 1.0) * (1.0 - SpecularTint) + m_ctint * SpecularTint;
+
+ color Cspec0 = (Specular * 0.08 * tmp_col) * (1.0 - Metallic) + BaseColor * Metallic;
+
+ if (distribution == "GGX" || Roughness <= 0.075) {
+ BSDF = BSDF + specular_weight * microfacet_ggx_aniso_fresnel(Normal, T, alpha_x, alpha_y, (2.0 / (1.0 - sqrt(0.08 * Specular))) - 1.0, BaseColor, Cspec0);
+ } else {
+ BSDF = BSDF + specular_weight * microfacet_multi_ggx_aniso_fresnel(Normal, T, alpha_x, alpha_y, (2.0 / (1.0 - sqrt(0.08 * Specular))) - 1.0, BaseColor, Cspec0);
+ }
+ }
+
+ if (final_transmission > 1e-5) {
+ color Cspec0 = BaseColor * SpecularTint + color(1.0, 1.0, 1.0) * (1.0 - SpecularTint);
+ float eta = backfacing() ? 1.0 / f : f;
+
+ if (distribution == "GGX" || Roughness <= 5e-2) {
+ float cosNO = dot(Normal, I);
+ float Fr = fresnel_dielectric_cos(cosNO, eta);
+
+ float refl_roughness = Roughness;
+ if (Roughness <= 1e-2)
+ refl_roughness = 0.0;
+
+ float transmission_roughness = refl_roughness;
+ if (distribution == "GGX")
+ transmission_roughness = 1.0 - (1.0 - refl_roughness) * (1.0 - TransmissionRoughness);
+
+ BSDF = BSDF + final_transmission * (Fr * microfacet_ggx_fresnel(Normal, refl_roughness * refl_roughness, eta, BaseColor, Cspec0) +
+ (1.0 - Fr) * BaseColor * microfacet_ggx_refraction(Normal, transmission_roughness * transmission_roughness, eta));
+ } else {
+ BSDF = BSDF + final_transmission * microfacet_multi_ggx_glass_fresnel(Normal, Roughness * Roughness, eta, BaseColor, Cspec0);
+ }
+ }
+
+ if (Clearcoat > 1e-5) {
+ BSDF = BSDF + principled_clearcoat(ClearcoatNormal, Clearcoat, ClearcoatRoughness * ClearcoatRoughness);
+ }
+}
+
diff --git a/intern/cycles/kernel/shaders/stdosl.h b/intern/cycles/kernel/shaders/stdosl.h
index a8dda8a12c9..c91d2918687 100644
--- a/intern/cycles/kernel/shaders/stdosl.h
+++ b/intern/cycles/kernel/shaders/stdosl.h
@@ -530,6 +530,11 @@ closure color microfacet_ggx_refraction(normal N, float ag, float eta) BUILTIN;
closure color microfacet_multi_ggx(normal N, float ag, color C) BUILTIN;
closure color microfacet_multi_ggx_aniso(normal N, vector T, float ax, float ay, color C) BUILTIN;
closure color microfacet_multi_ggx_glass(normal N, float ag, float eta, color C) BUILTIN;
+closure color microfacet_ggx_fresnel(normal N, float ag, float eta, color C, color Cspec0) BUILTIN;
+closure color microfacet_ggx_aniso_fresnel(normal N, vector T, float ax, float ay, float eta, color C, color Cspec0) BUILTIN;
+closure color microfacet_multi_ggx_fresnel(normal N, float ag, float eta, color C, color Cspec0) BUILTIN;
+closure color microfacet_multi_ggx_aniso_fresnel(normal N, vector T, float ax, float ay, float eta, color C, color Cspec0) BUILTIN;
+closure color microfacet_multi_ggx_glass_fresnel(normal N, float ag, float eta, color C, color Cspec0) BUILTIN;
closure color microfacet_beckmann(normal N, float ab) BUILTIN;
closure color microfacet_beckmann_aniso(normal N, vector T, float ax, float ay) BUILTIN;
closure color microfacet_beckmann_refraction(normal N, float ab, float eta) BUILTIN;
@@ -539,11 +544,15 @@ closure color emission() BUILTIN;
closure color background() BUILTIN;
closure color holdout() BUILTIN;
closure color ambient_occlusion() BUILTIN;
+closure color principled_diffuse(normal N, float roughness) BUILTIN;
+closure color principled_sheen(normal N) BUILTIN;
+closure color principled_clearcoat(normal N, float clearcoat, float clearcoat_roughness) BUILTIN;
// BSSRDF
closure color bssrdf_cubic(normal N, vector radius, float texture_blur, float sharpness) BUILTIN;
closure color bssrdf_gaussian(normal N, vector radius, float texture_blur) BUILTIN;
closure color bssrdf_burley(normal N, vector radius, float texture_blur, color albedo) BUILTIN;
+closure color bssrdf_principled(normal N, vector radius, float texture_blur, color subsurface_color, float roughness) BUILTIN;
// Hair
closure color hair_reflection(normal N, float roughnessu, float roughnessv, vector T, float offset) BUILTIN;
diff --git a/intern/cycles/kernel/split/kernel_branched.h b/intern/cycles/kernel/split/kernel_branched.h
new file mode 100644
index 00000000000..e2762a85fc8
--- /dev/null
+++ b/intern/cycles/kernel/split/kernel_branched.h
@@ -0,0 +1,220 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+#ifdef __BRANCHED_PATH__
+
+/* sets up the various state needed to do an indirect loop */
+ccl_device_inline void kernel_split_branched_path_indirect_loop_init(KernelGlobals *kg, int ray_index)
+{
+ SplitBranchedState *branched_state = &kernel_split_state.branched_state[ray_index];
+
+ /* save a copy of the state to restore later */
+#define BRANCHED_STORE(name) \
+ branched_state->name = kernel_split_state.name[ray_index];
+
+ BRANCHED_STORE(path_state);
+ BRANCHED_STORE(throughput);
+ BRANCHED_STORE(ray);
+ BRANCHED_STORE(sd);
+ BRANCHED_STORE(isect);
+ BRANCHED_STORE(ray_state);
+
+#undef BRANCHED_STORE
+
+ /* set loop counters to intial position */
+ branched_state->next_closure = 0;
+ branched_state->next_sample = 0;
+}
+
+/* ends an indirect loop and restores the previous state */
+ccl_device_inline void kernel_split_branched_path_indirect_loop_end(KernelGlobals *kg, int ray_index)
+{
+ SplitBranchedState *branched_state = &kernel_split_state.branched_state[ray_index];
+
+ /* restore state */
+#define BRANCHED_RESTORE(name) \
+ kernel_split_state.name[ray_index] = branched_state->name;
+
+ BRANCHED_RESTORE(path_state);
+ BRANCHED_RESTORE(throughput);
+ BRANCHED_RESTORE(ray);
+ BRANCHED_RESTORE(sd);
+ BRANCHED_RESTORE(isect);
+ BRANCHED_RESTORE(ray_state);
+
+#undef BRANCHED_RESTORE
+
+ /* leave indirect loop */
+ REMOVE_RAY_FLAG(kernel_split_state.ray_state, ray_index, RAY_BRANCHED_INDIRECT);
+}
+
+ccl_device_inline bool kernel_split_branched_indirect_start_shared(KernelGlobals *kg, int ray_index)
+{
+ ccl_global char *ray_state = kernel_split_state.ray_state;
+
+ int inactive_ray = dequeue_ray_index(QUEUE_INACTIVE_RAYS,
+ kernel_split_state.queue_data, kernel_split_params.queue_size, kernel_split_params.queue_index);
+
+ if(!IS_STATE(ray_state, inactive_ray, RAY_INACTIVE)) {
+ return false;
+ }
+
+#define SPLIT_DATA_ENTRY(type, name, num) \
+ kernel_split_state.name[inactive_ray] = kernel_split_state.name[ray_index];
+ SPLIT_DATA_ENTRIES_BRANCHED_SHARED
+#undef SPLIT_DATA_ENTRY
+
+ kernel_split_state.branched_state[inactive_ray].shared_sample_count = 0;
+ kernel_split_state.branched_state[inactive_ray].original_ray = ray_index;
+ kernel_split_state.branched_state[inactive_ray].waiting_on_shared_samples = false;
+
+ PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
+ PathRadiance *inactive_L = &kernel_split_state.path_radiance[inactive_ray];
+
+ path_radiance_init(inactive_L, kernel_data.film.use_light_pass);
+ inactive_L->direct_throughput = L->direct_throughput;
+ path_radiance_copy_indirect(inactive_L, L);
+
+ ray_state[inactive_ray] = RAY_REGENERATED;
+ ADD_RAY_FLAG(ray_state, inactive_ray, RAY_BRANCHED_INDIRECT_SHARED);
+ ADD_RAY_FLAG(ray_state, inactive_ray, IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT));
+
+ atomic_fetch_and_inc_uint32((ccl_global uint*)&kernel_split_state.branched_state[ray_index].shared_sample_count);
+
+ return true;
+}
+
+/* bounce off surface and integrate indirect light */
+ccl_device_noinline bool kernel_split_branched_path_surface_indirect_light_iter(KernelGlobals *kg,
+ int ray_index,
+ float num_samples_adjust,
+ ShaderData *saved_sd,
+ bool reset_path_state,
+ bool wait_for_shared)
+{
+ SplitBranchedState *branched_state = &kernel_split_state.branched_state[ray_index];
+
+ ShaderData *sd = saved_sd;
+ RNG rng = kernel_split_state.rng[ray_index];
+ PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
+ float3 throughput = branched_state->throughput;
+ ccl_global PathState *ps = &kernel_split_state.path_state[ray_index];
+
+ float sum_sample_weight = 0.0f;
+#ifdef __DENOISING_FEATURES__
+ if(ps->denoising_feature_weight > 0.0f) {
+ for(int i = 0; i < sd->num_closure; i++) {
+ const ShaderClosure *sc = &sd->closure[i];
+
+ /* transparency is not handled here, but in outer loop */
+ if(!CLOSURE_IS_BSDF(sc->type) || CLOSURE_IS_BSDF_TRANSPARENT(sc->type)) {
+ continue;
+ }
+
+ sum_sample_weight += sc->sample_weight;
+ }
+ }
+ else {
+ sum_sample_weight = 1.0f;
+ }
+#endif /* __DENOISING_FEATURES__ */
+
+ for(int i = branched_state->next_closure; i < sd->num_closure; i++) {
+ const ShaderClosure *sc = &sd->closure[i];
+
+ if(!CLOSURE_IS_BSDF(sc->type))
+ continue;
+ /* transparency is not handled here, but in outer loop */
+ if(sc->type == CLOSURE_BSDF_TRANSPARENT_ID)
+ continue;
+
+ int num_samples;
+
+ if(CLOSURE_IS_BSDF_DIFFUSE(sc->type))
+ num_samples = kernel_data.integrator.diffuse_samples;
+ else if(CLOSURE_IS_BSDF_BSSRDF(sc->type))
+ num_samples = 1;
+ else if(CLOSURE_IS_BSDF_GLOSSY(sc->type))
+ num_samples = kernel_data.integrator.glossy_samples;
+ else
+ num_samples = kernel_data.integrator.transmission_samples;
+
+ num_samples = ceil_to_int(num_samples_adjust*num_samples);
+
+ float num_samples_inv = num_samples_adjust/num_samples;
+ RNG bsdf_rng = cmj_hash(rng, i);
+
+ for(int j = branched_state->next_sample; j < num_samples; j++) {
+ if(reset_path_state) {
+ *ps = branched_state->path_state;
+ }
+
+ ccl_global float3 *tp = &kernel_split_state.throughput[ray_index];
+ *tp = throughput;
+
+ ccl_global Ray *bsdf_ray = &kernel_split_state.ray[ray_index];
+
+ if(!kernel_branched_path_surface_bounce(kg,
+ &bsdf_rng,
+ sd,
+ sc,
+ j,
+ num_samples,
+ tp,
+ ps,
+ L,
+ bsdf_ray,
+ sum_sample_weight))
+ {
+ continue;
+ }
+
+ /* update state for next iteration */
+ branched_state->next_closure = i;
+ branched_state->next_sample = j+1;
+ branched_state->num_samples = num_samples;
+
+ /* start the indirect path */
+ *tp *= num_samples_inv;
+
+ if(kernel_split_branched_indirect_start_shared(kg, ray_index)) {
+ continue;
+ }
+
+ return true;
+ }
+
+ branched_state->next_sample = 0;
+ }
+
+ branched_state->next_closure = sd->num_closure;
+
+ if(wait_for_shared) {
+ branched_state->waiting_on_shared_samples = (branched_state->shared_sample_count > 0);
+ if(branched_state->waiting_on_shared_samples) {
+ return true;
+ }
+ }
+
+ return false;
+}
+
+#endif /* __BRANCHED_PATH__ */
+
+CCL_NAMESPACE_END
+
diff --git a/intern/cycles/kernel/split/kernel_buffer_update.h b/intern/cycles/kernel/split/kernel_buffer_update.h
index 859c221d976..4c1fdd2d69c 100644
--- a/intern/cycles/kernel/split/kernel_buffer_update.h
+++ b/intern/cycles/kernel/split/kernel_buffer_update.h
@@ -111,24 +111,14 @@ ccl_device void kernel_buffer_update(KernelGlobals *kg,
buffer += (kernel_split_params.offset + pixel_x + pixel_y*stride) * kernel_data.film.pass_stride;
if(IS_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER)) {
- float3 L_sum;
-#ifdef __SHADOW_TRICKS__
- if(state->flag & PATH_RAY_SHADOW_CATCHER) {
- L_sum = path_radiance_sum_shadowcatcher(kg, L, L_transparent);
- }
- else
-#endif /* __SHADOW_TRICKS__ */
- {
- L_sum = path_radiance_clamp_and_sum(kg, L);
- }
- kernel_write_light_passes(kg, buffer, L, sample);
#ifdef __KERNEL_DEBUG__
kernel_write_debug_passes(kg, buffer, state, debug_data, sample);
#endif
- float4 L_rad = make_float4(L_sum.x, L_sum.y, L_sum.z, 1.0f - (*L_transparent));
/* accumulate result in output buffer */
- kernel_write_pass_float4(buffer, sample, L_rad);
+ bool is_shadow_catcher = (state->flag & PATH_RAY_SHADOW_CATCHER);
+ kernel_write_result(kg, buffer, sample, L, 1.0f - (*L_transparent), is_shadow_catcher);
+
path_rng_end(kg, rng_state, rng);
ASSIGN_RAY_STATE(ray_state, ray_index, RAY_TO_REGENERATE);
diff --git a/intern/cycles/kernel/split/kernel_data_init.h b/intern/cycles/kernel/split/kernel_data_init.h
index 9d3d01fff75..e4545d66eff 100644
--- a/intern/cycles/kernel/split/kernel_data_init.h
+++ b/intern/cycles/kernel/split/kernel_data_init.h
@@ -67,6 +67,10 @@ void KERNEL_FUNCTION_FULL_NAME(data_init)(
unsigned int num_samples,
ccl_global float *buffer)
{
+#ifdef KERNEL_STUB
+ STUB_ASSERT(KERNEL_ARCH, data_init);
+#else
+
#ifdef __KERNEL_OPENCL__
kg->data = data;
#endif
@@ -105,21 +109,16 @@ void KERNEL_FUNCTION_FULL_NAME(data_init)(
/* Initialize queue data and queue index. */
if(thread_index < queuesize) {
- /* Initialize active ray queue. */
- kernel_split_state.queue_data[QUEUE_ACTIVE_AND_REGENERATED_RAYS * queuesize + thread_index] = QUEUE_EMPTY_SLOT;
- /* Initialize background and buffer update queue. */
- kernel_split_state.queue_data[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS * queuesize + thread_index] = QUEUE_EMPTY_SLOT;
- /* Initialize shadow ray cast of AO queue. */
- kernel_split_state.queue_data[QUEUE_SHADOW_RAY_CAST_AO_RAYS * queuesize + thread_index] = QUEUE_EMPTY_SLOT;
- /* Initialize shadow ray cast of direct lighting queue. */
- kernel_split_state.queue_data[QUEUE_SHADOW_RAY_CAST_DL_RAYS * queuesize + thread_index] = QUEUE_EMPTY_SLOT;
+ for(int i = 0; i < NUM_QUEUES; i++) {
+ kernel_split_state.queue_data[i * queuesize + thread_index] = QUEUE_EMPTY_SLOT;
+ }
}
if(thread_index == 0) {
- Queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS] = 0;
- Queue_index[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS] = 0;
- Queue_index[QUEUE_SHADOW_RAY_CAST_AO_RAYS] = 0;
- Queue_index[QUEUE_SHADOW_RAY_CAST_DL_RAYS] = 0;
+ for(int i = 0; i < NUM_QUEUES; i++) {
+ Queue_index[i] = 0;
+ }
+
/* The scene-intersect kernel should not use the queues very first time.
* since the queue would be empty.
*/
@@ -148,6 +147,8 @@ void KERNEL_FUNCTION_FULL_NAME(data_init)(
*(rng_state + index) = hash_int_2d(x, y);
}
}
+
+#endif /* KERENL_STUB */
}
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_direct_lighting.h b/intern/cycles/kernel/split/kernel_direct_lighting.h
index bdbf7387b95..3336c968a44 100644
--- a/intern/cycles/kernel/split/kernel_direct_lighting.h
+++ b/intern/cycles/kernel/split/kernel_direct_lighting.h
@@ -56,23 +56,6 @@ ccl_device void kernel_direct_lighting(KernelGlobals *kg,
kernel_split_params.queue_size,
0);
-#ifdef __COMPUTE_DEVICE_GPU__
- /* If we are executing on a GPU device, we exit all threads that are not
- * required.
- *
- * If we are executing on a CPU device, then we need to keep all threads
- * active since we have barrier() calls later in the kernel. CPU devices,
- * expect all threads to execute barrier statement.
- */
- if(ray_index == QUEUE_EMPTY_SLOT) {
- return;
- }
-#endif
-
-#ifndef __COMPUTE_DEVICE_GPU__
- if(ray_index != QUEUE_EMPTY_SLOT) {
-#endif
-
if(IS_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE)) {
ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
ShaderData *sd = &kernel_split_state.sd[ray_index];
@@ -80,25 +63,24 @@ ccl_device void kernel_direct_lighting(KernelGlobals *kg,
/* direct lighting */
#ifdef __EMISSION__
RNG rng = kernel_split_state.rng[ray_index];
+
bool flag = (kernel_data.integrator.use_direct_light &&
(sd->flag & SD_BSDF_HAS_EVAL));
+
+# ifdef __BRANCHED_PATH__
+ if(flag && kernel_data.integrator.branched) {
+ flag = false;
+ enqueue_flag = 1;
+ }
+# endif /* __BRANCHED_PATH__ */
+
# ifdef __SHADOW_TRICKS__
if(flag && state->flag & PATH_RAY_SHADOW_CATCHER) {
flag = false;
- ShaderData *emission_sd = &kernel_split_state.sd_DL_shadow[ray_index];
- float3 throughput = kernel_split_state.throughput[ray_index];
- PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
- kernel_branched_path_surface_connect_light(kg,
- &rng,
- sd,
- emission_sd,
- state,
- throughput,
- 1.0f,
- L,
- 1);
+ enqueue_flag = 1;
}
# endif /* __SHADOW_TRICKS__ */
+
if(flag) {
/* Sample illumination from lights to find path contribution. */
float light_t = path_state_rng_1D(kg, &rng, state, PRNG_LIGHT);
@@ -129,7 +111,6 @@ ccl_device void kernel_direct_lighting(KernelGlobals *kg,
kernel_split_state.bsdf_eval[ray_index] = L_light;
kernel_split_state.is_lamp[ray_index] = is_lamp;
/* Mark ray state for next shadow kernel. */
- ADD_RAY_FLAG(kernel_split_state.ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL);
enqueue_flag = 1;
}
}
@@ -138,10 +119,6 @@ ccl_device void kernel_direct_lighting(KernelGlobals *kg,
#endif /* __EMISSION__ */
}
-#ifndef __COMPUTE_DEVICE_GPU__
- }
-#endif
-
#ifdef __EMISSION__
/* Enqueue RAY_SHADOW_RAY_CAST_DL rays. */
enqueue_ray_index_local(ray_index,
@@ -152,6 +129,27 @@ ccl_device void kernel_direct_lighting(KernelGlobals *kg,
kernel_split_state.queue_data,
kernel_split_params.queue_index);
#endif
+
+#ifdef __BRANCHED_PATH__
+ /* Enqueue RAY_LIGHT_INDIRECT_NEXT_ITER rays
+ * this is the last kernel before next_iteration_setup that uses local atomics so we do this here
+ */
+ ccl_barrier(CCL_LOCAL_MEM_FENCE);
+ if(ccl_local_id(0) == 0 && ccl_local_id(1) == 0) {
+ *local_queue_atomics = 0;
+ }
+ ccl_barrier(CCL_LOCAL_MEM_FENCE);
+
+ ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
+ enqueue_ray_index_local(ray_index,
+ QUEUE_LIGHT_INDIRECT_ITER,
+ IS_STATE(kernel_split_state.ray_state, ray_index, RAY_LIGHT_INDIRECT_NEXT_ITER),
+ kernel_split_params.queue_size,
+ local_queue_atomics,
+ kernel_split_state.queue_data,
+ kernel_split_params.queue_index);
+
+#endif /* __BRANCHED_PATH__ */
}
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_do_volume.h b/intern/cycles/kernel/split/kernel_do_volume.h
index 47d3c280831..9f8dd2392d9 100644
--- a/intern/cycles/kernel/split/kernel_do_volume.h
+++ b/intern/cycles/kernel/split/kernel_do_volume.h
@@ -16,6 +16,100 @@
CCL_NAMESPACE_BEGIN
+#if defined(__BRANCHED_PATH__) && defined(__VOLUME__)
+
+ccl_device_inline void kernel_split_branched_path_volume_indirect_light_init(KernelGlobals *kg, int ray_index)
+{
+ kernel_split_branched_path_indirect_loop_init(kg, ray_index);
+
+ ADD_RAY_FLAG(kernel_split_state.ray_state, ray_index, RAY_BRANCHED_VOLUME_INDIRECT);
+}
+
+ccl_device_noinline bool kernel_split_branched_path_volume_indirect_light_iter(KernelGlobals *kg, int ray_index)
+{
+ SplitBranchedState *branched_state = &kernel_split_state.branched_state[ray_index];
+
+ ShaderData *sd = &kernel_split_state.sd[ray_index];
+ RNG rng = kernel_split_state.rng[ray_index];
+ PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
+ ShaderData *emission_sd = &kernel_split_state.sd_DL_shadow[ray_index];
+
+ /* GPU: no decoupled ray marching, scatter probalistically */
+ int num_samples = kernel_data.integrator.volume_samples;
+ float num_samples_inv = 1.0f/num_samples;
+
+ Ray volume_ray = branched_state->ray;
+ volume_ray.t = (!IS_STATE(&branched_state->ray_state, 0, RAY_HIT_BACKGROUND)) ? branched_state->isect.t : FLT_MAX;
+
+ bool heterogeneous = volume_stack_is_heterogeneous(kg, branched_state->path_state.volume_stack);
+
+ for(int j = branched_state->next_sample; j < num_samples; j++) {
+ ccl_global PathState *ps = &kernel_split_state.path_state[ray_index];
+ *ps = branched_state->path_state;
+
+ ccl_global Ray *pray = &kernel_split_state.ray[ray_index];
+ *pray = branched_state->ray;
+
+ ccl_global float3 *tp = &kernel_split_state.throughput[ray_index];
+ *tp = branched_state->throughput * num_samples_inv;
+
+ /* branch RNG state */
+ path_state_branch(ps, j, num_samples);
+
+ /* integrate along volume segment with distance sampling */
+ VolumeIntegrateResult result = kernel_volume_integrate(
+ kg, ps, sd, &volume_ray, L, tp, &rng, heterogeneous);
+
+# ifdef __VOLUME_SCATTER__
+ if(result == VOLUME_PATH_SCATTERED) {
+ /* direct lighting */
+ kernel_path_volume_connect_light(kg, &rng, sd, emission_sd, *tp, &branched_state->path_state, L);
+
+ /* indirect light bounce */
+ if(!kernel_path_volume_bounce(kg, &rng, sd, tp, ps, L, pray)) {
+ continue;
+ }
+
+ /* start the indirect path */
+ branched_state->next_closure = 0;
+ branched_state->next_sample = j+1;
+ branched_state->num_samples = num_samples;
+
+ /* Attempting to share too many samples is slow for volumes as it causes us to
+ * loop here more and have many calls to kernel_volume_integrate which evaluates
+ * shaders. The many expensive shader evaluations cause the work load to become
+ * unbalanced and many threads to become idle in this kernel. Limiting the
+ * number of shared samples here helps quite a lot.
+ */
+ if(branched_state->shared_sample_count < 2) {
+ if(kernel_split_branched_indirect_start_shared(kg, ray_index)) {
+ continue;
+ }
+ }
+
+ return true;
+ }
+# endif
+ }
+
+ branched_state->next_sample = num_samples;
+
+ branched_state->waiting_on_shared_samples = (branched_state->shared_sample_count > 0);
+ if(branched_state->waiting_on_shared_samples) {
+ return true;
+ }
+
+ kernel_split_branched_path_indirect_loop_end(kg, ray_index);
+
+ /* todo: avoid this calculation using decoupled ray marching */
+ float3 throughput = kernel_split_state.throughput[ray_index];
+ kernel_volume_shadow(kg, emission_sd, &kernel_split_state.path_state[ray_index], &volume_ray, &throughput);
+ kernel_split_state.throughput[ray_index] = throughput;
+
+ return false;
+}
+
+#endif /* __BRANCHED_PATH__ && __VOLUME__ */
ccl_device void kernel_do_volume(KernelGlobals *kg)
{
@@ -23,37 +117,36 @@ ccl_device void kernel_do_volume(KernelGlobals *kg)
/* We will empty this queue in this kernel. */
if(ccl_global_id(0) == 0 && ccl_global_id(1) == 0) {
kernel_split_params.queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS] = 0;
+# ifdef __BRANCHED_PATH__
+ kernel_split_params.queue_index[QUEUE_VOLUME_INDIRECT_ITER] = 0;
+# endif /* __BRANCHED_PATH__ */
}
- /* Fetch use_queues_flag. */
- char local_use_queues_flag = *kernel_split_params.use_queues_flag;
- ccl_barrier(CCL_LOCAL_MEM_FENCE);
int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
- if(local_use_queues_flag) {
+
+ if(*kernel_split_params.use_queues_flag) {
ray_index = get_ray_index(kg, ray_index,
QUEUE_ACTIVE_AND_REGENERATED_RAYS,
kernel_split_state.queue_data,
kernel_split_params.queue_size,
1);
- if(ray_index == QUEUE_EMPTY_SLOT) {
- return;
- }
}
- if(IS_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE) ||
- IS_STATE(kernel_split_state.ray_state, ray_index, RAY_HIT_BACKGROUND)) {
+ ccl_global char *ray_state = kernel_split_state.ray_state;
- bool hit = ! IS_STATE(kernel_split_state.ray_state, ray_index, RAY_HIT_BACKGROUND);
-
- PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
- ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
+ PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
+ ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
+ if(IS_STATE(ray_state, ray_index, RAY_ACTIVE) ||
+ IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND)) {
ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index];
ccl_global Ray *ray = &kernel_split_state.ray[ray_index];
RNG rng = kernel_split_state.rng[ray_index];
ccl_global Intersection *isect = &kernel_split_state.isect[ray_index];
ShaderData *sd = &kernel_split_state.sd[ray_index];
- ShaderData *sd_input = &kernel_split_state.sd_DL_shadow[ray_index];
+ ShaderData *emission_sd = &kernel_split_state.sd_DL_shadow[ray_index];
+
+ bool hit = ! IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND);
/* Sanitize volume stack. */
if(!hit) {
@@ -64,31 +157,68 @@ ccl_device void kernel_do_volume(KernelGlobals *kg)
Ray volume_ray = *ray;
volume_ray.t = (hit)? isect->t: FLT_MAX;
- bool heterogeneous = volume_stack_is_heterogeneous(kg, state->volume_stack);
+# ifdef __BRANCHED_PATH__
+ if(!kernel_data.integrator.branched || IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT)) {
+# endif /* __BRANCHED_PATH__ */
+ bool heterogeneous = volume_stack_is_heterogeneous(kg, state->volume_stack);
- {
- /* integrate along volume segment with distance sampling */
- VolumeIntegrateResult result = kernel_volume_integrate(
- kg, state, sd, &volume_ray, L, throughput, &rng, heterogeneous);
+ {
+ /* integrate along volume segment with distance sampling */
+ VolumeIntegrateResult result = kernel_volume_integrate(
+ kg, state, sd, &volume_ray, L, throughput, &rng, heterogeneous);
# ifdef __VOLUME_SCATTER__
- if(result == VOLUME_PATH_SCATTERED) {
- /* direct lighting */
- kernel_path_volume_connect_light(kg, &rng, sd, sd_input, *throughput, state, L);
-
- /* indirect light bounce */
- if(kernel_path_volume_bounce(kg, &rng, sd, throughput, state, L, ray))
- ASSIGN_RAY_STATE(kernel_split_state.ray_state, ray_index, RAY_REGENERATED);
- else
- ASSIGN_RAY_STATE(kernel_split_state.ray_state, ray_index, RAY_UPDATE_BUFFER);
+ if(result == VOLUME_PATH_SCATTERED) {
+ /* direct lighting */
+ kernel_path_volume_connect_light(kg, &rng, sd, emission_sd, *throughput, state, L);
+
+ /* indirect light bounce */
+ if(kernel_path_volume_bounce(kg, &rng, sd, throughput, state, L, ray)) {
+ ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED);
+ }
+ else {
+ kernel_split_path_end(kg, ray_index);
+ }
+ }
+# endif /* __VOLUME_SCATTER__ */
}
-# endif
+
+# ifdef __BRANCHED_PATH__
}
+ else {
+ kernel_split_branched_path_volume_indirect_light_init(kg, ray_index);
+
+ if(kernel_split_branched_path_volume_indirect_light_iter(kg, ray_index)) {
+ ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED);
+ }
+ }
+# endif /* __BRANCHED_PATH__ */
}
+
kernel_split_state.rng[ray_index] = rng;
}
-#endif
+# ifdef __BRANCHED_PATH__
+ /* iter loop */
+ ray_index = get_ray_index(kg, ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0),
+ QUEUE_VOLUME_INDIRECT_ITER,
+ kernel_split_state.queue_data,
+ kernel_split_params.queue_size,
+ 1);
+
+ if(IS_STATE(ray_state, ray_index, RAY_VOLUME_INDIRECT_NEXT_ITER)) {
+ /* for render passes, sum and reset indirect light pass variables
+ * for the next samples */
+ path_radiance_sum_indirect(&kernel_split_state.path_radiance[ray_index]);
+ path_radiance_reset_indirect(&kernel_split_state.path_radiance[ray_index]);
+
+ if(kernel_split_branched_path_volume_indirect_light_iter(kg, ray_index)) {
+ ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED);
+ }
+ }
+# endif /* __BRANCHED_PATH__ */
+
+#endif /* __VOLUME__ */
}
diff --git a/intern/cycles/kernel/split/kernel_enqueue_inactive.h b/intern/cycles/kernel/split/kernel_enqueue_inactive.h
new file mode 100644
index 00000000000..496355bbc3a
--- /dev/null
+++ b/intern/cycles/kernel/split/kernel_enqueue_inactive.h
@@ -0,0 +1,46 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+ccl_device void kernel_enqueue_inactive(KernelGlobals *kg,
+ ccl_local_param unsigned int *local_queue_atomics)
+{
+#ifdef __BRANCHED_PATH__
+ /* Enqeueue RAY_INACTIVE rays into QUEUE_INACTIVE_RAYS queue. */
+ if(ccl_local_id(0) == 0 && ccl_local_id(1) == 0) {
+ *local_queue_atomics = 0;
+ }
+ ccl_barrier(CCL_LOCAL_MEM_FENCE);
+
+ int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
+
+ char enqueue_flag = 0;
+ if(IS_STATE(kernel_split_state.ray_state, ray_index, RAY_INACTIVE)) {
+ enqueue_flag = 1;
+ }
+
+ enqueue_ray_index_local(ray_index,
+ QUEUE_INACTIVE_RAYS,
+ enqueue_flag,
+ kernel_split_params.queue_size,
+ local_queue_atomics,
+ kernel_split_state.queue_data,
+ kernel_split_params.queue_index);
+#endif /* __BRANCHED_PATH__ */
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h b/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h
index 9fc853a84bf..fec671be016 100644
--- a/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h
+++ b/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h
@@ -52,6 +52,7 @@ CCL_NAMESPACE_BEGIN
* - QUEUE_SHADOW_RAY_CAST_AO_RAYS will be filled with rays marked with
* flag RAY_SHADOW_RAY_CAST_AO
*/
+
ccl_device void kernel_holdout_emission_blurring_pathtermination_ao(
KernelGlobals *kg,
ccl_local_param BackgroundAOLocals *locals)
@@ -62,8 +63,9 @@ ccl_device void kernel_holdout_emission_blurring_pathtermination_ao(
}
ccl_barrier(CCL_LOCAL_MEM_FENCE);
+#ifdef __AO__
char enqueue_flag = 0;
- char enqueue_flag_AO_SHADOW_RAY_CAST = 0;
+#endif
int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
ray_index = get_ray_index(kg, ray_index,
QUEUE_ACTIVE_AND_REGENERATED_RAYS,
@@ -122,14 +124,22 @@ ccl_device void kernel_holdout_emission_blurring_pathtermination_ao(
#ifdef __SHADOW_TRICKS__
if((sd->object_flag & SD_OBJECT_SHADOW_CATCHER)) {
- if (state->flag & PATH_RAY_CAMERA) {
- state->flag |= (PATH_RAY_SHADOW_CATCHER | PATH_RAY_SHADOW_CATCHER_ONLY);
+ if(state->flag & PATH_RAY_CAMERA) {
+ PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
+ state->flag |= (PATH_RAY_SHADOW_CATCHER |
+ PATH_RAY_SHADOW_CATCHER_ONLY |
+ PATH_RAY_STORE_SHADOW_INFO);
state->catcher_object = sd->object;
if(!kernel_data.background.transparent) {
- PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
ccl_global Ray *ray = &kernel_split_state.ray[ray_index];
- L->shadow_color = indirect_background(kg, &kernel_split_state.sd_DL_shadow[ray_index], state, ray);
+ L->shadow_background_color = indirect_background(
+ kg,
+ &kernel_split_state.sd_DL_shadow[ray_index],
+ state,
+ ray);
}
+ L->shadow_radiance_sum = path_radiance_clamp_and_sum(kg, L);
+ L->shadow_throughput = average(throughput);
}
}
else {
@@ -155,8 +165,7 @@ ccl_device void kernel_holdout_emission_blurring_pathtermination_ao(
kernel_split_state.L_transparent[ray_index] += average(holdout_weight*throughput);
}
if(sd->object_flag & SD_OBJECT_HOLDOUT_MASK) {
- ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
- enqueue_flag = 1;
+ kernel_split_path_end(kg, ray_index);
}
}
#endif /* __HOLDOUT__ */
@@ -164,18 +173,31 @@ ccl_device void kernel_holdout_emission_blurring_pathtermination_ao(
if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
- /* Holdout mask objects do not write data passes. */
- kernel_write_data_passes(kg,
- buffer,
- L,
- sd,
- sample,
- state,
- throughput);
+
+#ifdef __BRANCHED_PATH__
+ if(!IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT))
+#endif /* __BRANCHED_PATH__ */
+ {
+ /* Holdout mask objects do not write data passes. */
+ kernel_write_data_passes(kg,
+ buffer,
+ L,
+ sd,
+ sample,
+ state,
+ throughput);
+ }
+
/* Blurring of bsdf after bounces, for rays that have a small likelihood
* of following this particular path (diffuse, rough glossy.
*/
- if(kernel_data.integrator.filter_glossy != FLT_MAX) {
+#ifndef __BRANCHED_PATH__
+ if(kernel_data.integrator.filter_glossy != FLT_MAX)
+#else
+ if(kernel_data.integrator.filter_glossy != FLT_MAX &&
+ (!kernel_data.integrator.branched || IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT)))
+#endif /* __BRANCHED_PATH__ */
+ {
float blur_pdf = kernel_data.integrator.filter_glossy*state->min_ray_pdf;
if(blur_pdf < 1.0f) {
float blur_roughness = sqrtf(1.0f - blur_pdf)*0.5f;
@@ -201,85 +223,62 @@ ccl_device void kernel_holdout_emission_blurring_pathtermination_ao(
* mainly due to the mixed in MIS that we use. gives too many unneeded
* shader evaluations, only need emission if we are going to terminate.
*/
+#ifndef __BRANCHED_PATH__
float probability = path_state_terminate_probability(kg, state, throughput);
+#else
+ float probability = 1.0f;
+
+ if(!kernel_data.integrator.branched) {
+ probability = path_state_terminate_probability(kg, state, throughput);
+ }
+ else if(IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT)) {
+ int num_samples = kernel_split_state.branched_state[ray_index].num_samples;
+ probability = path_state_terminate_probability(kg, state, throughput*num_samples);
+ }
+ else if(state->flag & PATH_RAY_TRANSPARENT) {
+ probability = path_state_terminate_probability(kg, state, throughput);
+ }
+#endif
if(probability == 0.0f) {
- ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
- enqueue_flag = 1;
+ kernel_split_path_end(kg, ray_index);
}
if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
if(probability != 1.0f) {
float terminate = path_state_rng_1D_for_decision(kg, &rng, state, PRNG_TERMINATE);
if(terminate >= probability) {
- ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
- enqueue_flag = 1;
+ kernel_split_path_end(kg, ray_index);
}
else {
kernel_split_state.throughput[ray_index] = throughput/probability;
}
}
+
+ kernel_update_denoising_features(kg, sd, state, L);
}
}
#ifdef __AO__
if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
/* ambient occlusion */
- if(kernel_data.integrator.use_ambient_occlusion ||
- (sd->flag & SD_AO))
- {
- /* todo: solve correlation */
- float bsdf_u, bsdf_v;
- path_state_rng_2D(kg, &rng, state, PRNG_BSDF_U, &bsdf_u, &bsdf_v);
-
- float ao_factor = kernel_data.background.ao_factor;
- float3 ao_N;
- kernel_split_state.ao_bsdf[ray_index] = shader_bsdf_ao(kg, sd, ao_factor, &ao_N);
- kernel_split_state.ao_alpha[ray_index] = shader_bsdf_alpha(kg, sd);
-
- float3 ao_D;
- float ao_pdf;
- sample_cos_hemisphere(ao_N, bsdf_u, bsdf_v, &ao_D, &ao_pdf);
-
- if(dot(sd->Ng, ao_D) > 0.0f && ao_pdf != 0.0f) {
- Ray _ray;
- _ray.P = ray_offset(sd->P, sd->Ng);
- _ray.D = ao_D;
- _ray.t = kernel_data.background.ao_distance;
-#ifdef __OBJECT_MOTION__
- _ray.time = sd->time;
-#endif
- _ray.dP = sd->dP;
- _ray.dD = differential3_zero();
- kernel_split_state.ao_light_ray[ray_index] = _ray;
-
- ADD_RAY_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO);
- enqueue_flag_AO_SHADOW_RAY_CAST = 1;
- }
+ if(kernel_data.integrator.use_ambient_occlusion || (sd->flag & SD_AO)) {
+ enqueue_flag = 1;
}
}
#endif /* __AO__ */
- kernel_split_state.rng[ray_index] = rng;
+ kernel_split_state.rng[ray_index] = rng;
#ifndef __COMPUTE_DEVICE_GPU__
}
#endif
- /* Enqueue RAY_UPDATE_BUFFER rays. */
- enqueue_ray_index_local(ray_index,
- QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
- enqueue_flag,
- kernel_split_params.queue_size,
- &locals->queue_atomics_bg,
- kernel_split_state.queue_data,
- kernel_split_params.queue_index);
-
#ifdef __AO__
/* Enqueue to-shadow-ray-cast rays. */
enqueue_ray_index_local(ray_index,
QUEUE_SHADOW_RAY_CAST_AO_RAYS,
- enqueue_flag_AO_SHADOW_RAY_CAST,
+ enqueue_flag,
kernel_split_params.queue_size,
&locals->queue_atomics_ao,
kernel_split_state.queue_data,
diff --git a/intern/cycles/kernel/split/kernel_indirect_background.h b/intern/cycles/kernel/split/kernel_indirect_background.h
index 8192528622e..f0ebb90f60a 100644
--- a/intern/cycles/kernel/split/kernel_indirect_background.h
+++ b/intern/cycles/kernel/split/kernel_indirect_background.h
@@ -23,7 +23,7 @@ ccl_device void kernel_indirect_background(KernelGlobals *kg)
int thread_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
int ray_index;
- if(kernel_data.integrator.ao_bounces) {
+ if(kernel_data.integrator.ao_bounces != INT_MAX) {
ray_index = get_ray_index(kg, thread_index,
QUEUE_ACTIVE_AND_REGENERATED_RAYS,
kernel_split_state.queue_data,
@@ -34,7 +34,7 @@ ccl_device void kernel_indirect_background(KernelGlobals *kg)
if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
if(state->bounce > kernel_data.integrator.ao_bounces) {
- ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
+ kernel_split_path_end(kg, ray_index);
}
}
}
@@ -63,7 +63,7 @@ ccl_device void kernel_indirect_background(KernelGlobals *kg)
#ifdef __PASSES__
if(!(kernel_data.film.pass_flag & PASS_BACKGROUND))
#endif
- ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
+ kernel_split_path_end(kg, ray_index);
}
if(IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND)) {
@@ -72,7 +72,7 @@ ccl_device void kernel_indirect_background(KernelGlobals *kg)
float3 L_background = indirect_background(kg, &kernel_split_state.sd_DL_shadow[ray_index], state, ray);
path_radiance_accum_background(L, state, (*throughput), L_background);
#endif
- ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
+ kernel_split_path_end(kg, ray_index);
}
}
diff --git a/intern/cycles/kernel/split/kernel_indirect_subsurface.h b/intern/cycles/kernel/split/kernel_indirect_subsurface.h
index a56e85abeb9..82bc2f01fd7 100644
--- a/intern/cycles/kernel/split/kernel_indirect_subsurface.h
+++ b/intern/cycles/kernel/split/kernel_indirect_subsurface.h
@@ -49,26 +49,29 @@ ccl_device void kernel_indirect_subsurface(KernelGlobals *kg)
ccl_global Ray *ray = &kernel_split_state.ray[ray_index];
ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index];
- if(IS_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER)) {
- ccl_addr_space SubsurfaceIndirectRays *ss_indirect = &kernel_split_state.ss_rays[ray_index];
- kernel_path_subsurface_accum_indirect(ss_indirect, L);
+#ifdef __BRANCHED_PATH__
+ if(!kernel_data.integrator.branched) {
+#endif
+ if(IS_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER)) {
+ ccl_addr_space SubsurfaceIndirectRays *ss_indirect = &kernel_split_state.ss_rays[ray_index];
+ kernel_path_subsurface_accum_indirect(ss_indirect, L);
- /* Trace indirect subsurface rays by restarting the loop. this uses less
- * stack memory than invoking kernel_path_indirect.
- */
- if(ss_indirect->num_rays) {
- kernel_path_subsurface_setup_indirect(kg,
- ss_indirect,
- state,
- ray,
- L,
- throughput);
- ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED);
- }
- else {
- ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
+ /* Trace indirect subsurface rays by restarting the loop. this uses less
+ * stack memory than invoking kernel_path_indirect.
+ */
+ if(ss_indirect->num_rays) {
+ kernel_path_subsurface_setup_indirect(kg,
+ ss_indirect,
+ state,
+ ray,
+ L,
+ throughput);
+ ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED);
+ }
}
+#ifdef __BRANCHED_PATH__
}
+#endif
#endif /* __SUBSURFACE__ */
diff --git a/intern/cycles/kernel/split/kernel_next_iteration_setup.h b/intern/cycles/kernel/split/kernel_next_iteration_setup.h
index 1bebc16e25b..7758e35fd32 100644
--- a/intern/cycles/kernel/split/kernel_next_iteration_setup.h
+++ b/intern/cycles/kernel/split/kernel_next_iteration_setup.h
@@ -44,6 +44,52 @@ CCL_NAMESPACE_BEGIN
* - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with
* RAY_TO_REGENERATE and more RAY_UPDATE_BUFFER rays.
*/
+
+#ifdef __BRANCHED_PATH__
+ccl_device_inline void kernel_split_branched_indirect_light_init(KernelGlobals *kg, int ray_index)
+{
+ kernel_split_branched_path_indirect_loop_init(kg, ray_index);
+
+ ADD_RAY_FLAG(kernel_split_state.ray_state, ray_index, RAY_BRANCHED_LIGHT_INDIRECT);
+}
+
+ccl_device void kernel_split_branched_indirect_light_end(KernelGlobals *kg, int ray_index)
+{
+ kernel_split_branched_path_indirect_loop_end(kg, ray_index);
+
+ ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index];
+ ShaderData *sd = &kernel_split_state.sd[ray_index];
+ ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
+ ccl_global Ray *ray = &kernel_split_state.ray[ray_index];
+
+ /* continue in case of transparency */
+ *throughput *= shader_bsdf_transparency(kg, sd);
+
+ if(is_zero(*throughput)) {
+ kernel_split_path_end(kg, ray_index);
+ }
+ else {
+ /* Update Path State */
+ state->flag |= PATH_RAY_TRANSPARENT;
+ state->transparent_bounce++;
+
+ ray->P = ray_offset(sd->P, -sd->Ng);
+ ray->t -= sd->ray_length; /* clipping works through transparent */
+
+# ifdef __RAY_DIFFERENTIALS__
+ ray->dP = sd->dP;
+ ray->dD.dx = -sd->dI.dx;
+ ray->dD.dy = -sd->dI.dy;
+# endif /* __RAY_DIFFERENTIALS__ */
+
+# ifdef __VOLUME__
+ /* enter/exit volume */
+ kernel_volume_stack_enter_exit(kg, sd, state->volume_stack);
+# endif /* __VOLUME__ */
+ }
+}
+#endif /* __BRANCHED_PATH__ */
+
ccl_device void kernel_next_iteration_setup(KernelGlobals *kg,
ccl_local_param unsigned int *local_queue_atomics)
{
@@ -67,7 +113,6 @@ ccl_device void kernel_next_iteration_setup(KernelGlobals *kg,
kernel_split_params.queue_index[QUEUE_SHADOW_RAY_CAST_DL_RAYS] = 0;
}
- char enqueue_flag = 0;
int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
ray_index = get_ray_index(kg, ray_index,
QUEUE_ACTIVE_AND_REGENERATED_RAYS,
@@ -75,102 +120,127 @@ ccl_device void kernel_next_iteration_setup(KernelGlobals *kg,
kernel_split_params.queue_size,
0);
-#ifdef __COMPUTE_DEVICE_GPU__
- /* If we are executing on a GPU device, we exit all threads that are not
- * required.
- *
- * If we are executing on a CPU device, then we need to keep all threads
- * active since we have barrier() calls later in the kernel. CPU devices,
- * expect all threads to execute barrier statement.
- */
- if(ray_index == QUEUE_EMPTY_SLOT) {
- return;
- }
-#endif
-
-#ifndef __COMPUTE_DEVICE_GPU__
- if(ray_index != QUEUE_EMPTY_SLOT) {
-#endif
-
- /* Load ShaderData structure. */
- PathRadiance *L = NULL;
- ccl_global PathState *state = NULL;
ccl_global char *ray_state = kernel_split_state.ray_state;
- /* Path radiance update for AO/Direct_lighting's shadow blocked. */
- if(IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL) ||
- IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO))
- {
- state = &kernel_split_state.path_state[ray_index];
- L = &kernel_split_state.path_radiance[ray_index];
- float3 _throughput = kernel_split_state.throughput[ray_index];
-
- if(IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO)) {
- float3 shadow = kernel_split_state.ao_light_ray[ray_index].P;
- // TODO(mai): investigate correctness here
- char update_path_radiance = (char)kernel_split_state.ao_light_ray[ray_index].t;
- if(update_path_radiance) {
- path_radiance_accum_ao(L,
- _throughput,
- kernel_split_state.ao_alpha[ray_index],
- kernel_split_state.ao_bsdf[ray_index],
- shadow,
- state->bounce);
- }
- else {
- path_radiance_accum_total_ao(L, _throughput, kernel_split_state.ao_bsdf[ray_index]);
+ bool active = IS_STATE(ray_state, ray_index, RAY_ACTIVE);
+ if(active) {
+ ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index];
+ ccl_global Ray *ray = &kernel_split_state.ray[ray_index];
+ RNG rng = kernel_split_state.rng[ray_index];
+ ShaderData *sd = &kernel_split_state.sd[ray_index];
+ ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
+ PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
+
+#ifdef __BRANCHED_PATH__
+ if(!kernel_data.integrator.branched || IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT)) {
+#endif
+ /* Compute direct lighting and next bounce. */
+ if(!kernel_path_surface_bounce(kg, &rng, sd, throughput, state, L, ray)) {
+ kernel_split_path_end(kg, ray_index);
}
- REMOVE_RAY_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO);
+#ifdef __BRANCHED_PATH__
}
-
- if(IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL)) {
- float3 shadow = kernel_split_state.light_ray[ray_index].P;
- // TODO(mai): investigate correctness here
- char update_path_radiance = (char)kernel_split_state.light_ray[ray_index].t;
- BsdfEval L_light = kernel_split_state.bsdf_eval[ray_index];
- if(update_path_radiance) {
- path_radiance_accum_light(L,
- _throughput,
- &L_light,
- shadow,
- 1.0f,
- state->bounce,
- kernel_split_state.is_lamp[ray_index]);
+ else {
+ kernel_split_branched_indirect_light_init(kg, ray_index);
+
+ if(kernel_split_branched_path_surface_indirect_light_iter(kg,
+ ray_index,
+ 1.0f,
+ &kernel_split_state.branched_state[ray_index].sd,
+ true,
+ true))
+ {
+ ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED);
}
else {
- path_radiance_accum_total_light(L, _throughput, &L_light);
+ kernel_split_branched_indirect_light_end(kg, ray_index);
}
- REMOVE_RAY_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL);
}
+#endif /* __BRANCHED_PATH__ */
+
+ kernel_split_state.rng[ray_index] = rng;
}
- if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
- ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index];
- ccl_global Ray *ray = &kernel_split_state.ray[ray_index];
- RNG rng = kernel_split_state.rng[ray_index];
- state = &kernel_split_state.path_state[ray_index];
- L = &kernel_split_state.path_radiance[ray_index];
+ /* Enqueue RAY_UPDATE_BUFFER rays. */
+ enqueue_ray_index_local(ray_index,
+ QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
+ IS_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER) && active,
+ kernel_split_params.queue_size,
+ local_queue_atomics,
+ kernel_split_state.queue_data,
+ kernel_split_params.queue_index);
+
+#ifdef __BRANCHED_PATH__
+ /* iter loop */
+ if(ccl_global_id(0) == 0 && ccl_global_id(1) == 0) {
+ kernel_split_params.queue_index[QUEUE_LIGHT_INDIRECT_ITER] = 0;
+ }
- /* Compute direct lighting and next bounce. */
- if(!kernel_path_surface_bounce(kg, &rng, &kernel_split_state.sd[ray_index], throughput, state, L, ray)) {
- ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
- enqueue_flag = 1;
+ ray_index = get_ray_index(kg, ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0),
+ QUEUE_LIGHT_INDIRECT_ITER,
+ kernel_split_state.queue_data,
+ kernel_split_params.queue_size,
+ 1);
+
+ if(IS_STATE(ray_state, ray_index, RAY_LIGHT_INDIRECT_NEXT_ITER)) {
+ /* for render passes, sum and reset indirect light pass variables
+ * for the next samples */
+ PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
+
+ path_radiance_sum_indirect(L);
+ path_radiance_reset_indirect(L);
+
+ if(kernel_split_branched_path_surface_indirect_light_iter(kg,
+ ray_index,
+ 1.0f,
+ &kernel_split_state.branched_state[ray_index].sd,
+ true,
+ true))
+ {
+ ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED);
+ }
+ else {
+ kernel_split_branched_indirect_light_end(kg, ray_index);
}
- kernel_split_state.rng[ray_index] = rng;
}
-#ifndef __COMPUTE_DEVICE_GPU__
+# ifdef __VOLUME__
+ /* Enqueue RAY_VOLUME_INDIRECT_NEXT_ITER rays */
+ ccl_barrier(CCL_LOCAL_MEM_FENCE);
+ if(ccl_local_id(0) == 0 && ccl_local_id(1) == 0) {
+ *local_queue_atomics = 0;
}
-#endif
+ ccl_barrier(CCL_LOCAL_MEM_FENCE);
- /* Enqueue RAY_UPDATE_BUFFER rays. */
+ ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
enqueue_ray_index_local(ray_index,
- QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
- enqueue_flag,
+ QUEUE_VOLUME_INDIRECT_ITER,
+ IS_STATE(kernel_split_state.ray_state, ray_index, RAY_VOLUME_INDIRECT_NEXT_ITER),
+ kernel_split_params.queue_size,
+ local_queue_atomics,
+ kernel_split_state.queue_data,
+ kernel_split_params.queue_index);
+
+# endif /* __VOLUME__ */
+
+# ifdef __SUBSURFACE__
+ /* Enqueue RAY_SUBSURFACE_INDIRECT_NEXT_ITER rays */
+ ccl_barrier(CCL_LOCAL_MEM_FENCE);
+ if(ccl_local_id(0) == 0 && ccl_local_id(1) == 0) {
+ *local_queue_atomics = 0;
+ }
+ ccl_barrier(CCL_LOCAL_MEM_FENCE);
+
+ ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
+ enqueue_ray_index_local(ray_index,
+ QUEUE_SUBSURFACE_INDIRECT_ITER,
+ IS_STATE(kernel_split_state.ray_state, ray_index, RAY_SUBSURFACE_INDIRECT_NEXT_ITER),
kernel_split_params.queue_size,
local_queue_atomics,
kernel_split_state.queue_data,
kernel_split_params.queue_index);
+# endif /* __SUBSURFACE__ */
+#endif /* __BRANCHED_PATH__ */
}
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_queue_enqueue.h b/intern/cycles/kernel/split/kernel_queue_enqueue.h
index e2e841f36d3..66ce2dfb6f1 100644
--- a/intern/cycles/kernel/split/kernel_queue_enqueue.h
+++ b/intern/cycles/kernel/split/kernel_queue_enqueue.h
@@ -51,7 +51,8 @@ ccl_device void kernel_queue_enqueue(KernelGlobals *kg,
int queue_number = -1;
if(IS_STATE(kernel_split_state.ray_state, ray_index, RAY_HIT_BACKGROUND) ||
- IS_STATE(kernel_split_state.ray_state, ray_index, RAY_UPDATE_BUFFER)) {
+ IS_STATE(kernel_split_state.ray_state, ray_index, RAY_UPDATE_BUFFER) ||
+ IS_STATE(kernel_split_state.ray_state, ray_index, RAY_TO_REGENERATE)) {
queue_number = QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS;
}
else if(IS_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE) ||
diff --git a/intern/cycles/kernel/split/kernel_scene_intersect.h b/intern/cycles/kernel/split/kernel_scene_intersect.h
index 5dc94caec85..45984ca509b 100644
--- a/intern/cycles/kernel/split/kernel_scene_intersect.h
+++ b/intern/cycles/kernel/split/kernel_scene_intersect.h
@@ -43,11 +43,21 @@ ccl_device void kernel_scene_intersect(KernelGlobals *kg)
}
/* All regenerated rays become active here */
- if(IS_STATE(kernel_split_state.ray_state, ray_index, RAY_REGENERATED))
- ASSIGN_RAY_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE);
+ if(IS_STATE(kernel_split_state.ray_state, ray_index, RAY_REGENERATED)) {
+#ifdef __BRANCHED_PATH__
+ if(kernel_split_state.branched_state[ray_index].waiting_on_shared_samples) {
+ kernel_split_path_end(kg, ray_index);
+ }
+ else
+#endif /* __BRANCHED_PATH__ */
+ {
+ ASSIGN_RAY_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE);
+ }
+ }
- if(!IS_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE))
+ if(!IS_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE)) {
return;
+ }
#ifdef __KERNEL_DEBUG__
DebugData *debug_data = &kernel_split_state.debug_data[ray_index];
diff --git a/intern/cycles/kernel/split/kernel_shader_eval.h b/intern/cycles/kernel/split/kernel_shader_eval.h
index 0f1696e34a0..2801b32f285 100644
--- a/intern/cycles/kernel/split/kernel_shader_eval.h
+++ b/intern/cycles/kernel/split/kernel_shader_eval.h
@@ -1,5 +1,5 @@
/*
- * Copyright 2011-2015 Blender Foundation
+ * Copyright 2011-2017 Blender Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -16,54 +16,61 @@
CCL_NAMESPACE_BEGIN
-/* This kernel sets up the ShaderData structure from the values computed
+/* This kernel evaluates ShaderData structure from the values computed
* by the previous kernels.
- *
- * It also identifies the rays of state RAY_TO_REGENERATE and enqueues them
- * in QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue.
*/
-ccl_device void kernel_shader_eval(KernelGlobals *kg,
- ccl_local_param unsigned int *local_queue_atomics)
+ccl_device void kernel_shader_eval(KernelGlobals *kg)
{
- /* Enqeueue RAY_TO_REGENERATE rays into QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue. */
- if(ccl_local_id(0) == 0 && ccl_local_id(1) == 0) {
- *local_queue_atomics = 0;
- }
- ccl_barrier(CCL_LOCAL_MEM_FENCE);
int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
+ /* Sorting on cuda split is not implemented */
+#ifdef __KERNEL_CUDA__
+ int queue_index = kernel_split_params.queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS];
+#else
+ int queue_index = kernel_split_params.queue_index[QUEUE_SHADER_SORTED_RAYS];
+#endif
+ if(ray_index >= queue_index) {
+ return;
+ }
ray_index = get_ray_index(kg, ray_index,
+#ifdef __KERNEL_CUDA__
QUEUE_ACTIVE_AND_REGENERATED_RAYS,
+#else
+ QUEUE_SHADER_SORTED_RAYS,
+#endif
kernel_split_state.queue_data,
kernel_split_params.queue_size,
0);
- char enqueue_flag = 0;
- if((ray_index != QUEUE_EMPTY_SLOT) && IS_STATE(kernel_split_state.ray_state, ray_index, RAY_TO_REGENERATE)) {
- enqueue_flag = 1;
+ if(ray_index == QUEUE_EMPTY_SLOT) {
+ return;
}
- enqueue_ray_index_local(ray_index,
- QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
- enqueue_flag,
- kernel_split_params.queue_size,
- local_queue_atomics,
- kernel_split_state.queue_data,
- kernel_split_params.queue_index);
-
- /* Continue on with shader evaluation. */
- if((ray_index != QUEUE_EMPTY_SLOT) && IS_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE)) {
- Intersection isect = kernel_split_state.isect[ray_index];
+ ccl_global char *ray_state = kernel_split_state.ray_state;
+ if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
RNG rng = kernel_split_state.rng[ray_index];
ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
- Ray ray = kernel_split_state.ray[ray_index];
- shader_setup_from_ray(kg,
- &kernel_split_state.sd[ray_index],
- &isect,
- &ray);
+#ifndef __BRANCHED_PATH__
float rbsdf = path_state_rng_1D_for_decision(kg, &rng, state, PRNG_BSDF);
shader_eval_surface(kg, &kernel_split_state.sd[ray_index], &rng, state, rbsdf, state->flag, SHADER_CONTEXT_MAIN);
+#else
+ ShaderContext ctx = SHADER_CONTEXT_MAIN;
+ float rbsdf = 0.0f;
+
+ if(!kernel_data.integrator.branched || IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT)) {
+ rbsdf = path_state_rng_1D_for_decision(kg, &rng, state, PRNG_BSDF);
+
+ }
+
+ if(IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT)) {
+ ctx = SHADER_CONTEXT_INDIRECT;
+ }
+
+ shader_eval_surface(kg, &kernel_split_state.sd[ray_index], &rng, state, rbsdf, state->flag, ctx);
+ shader_merge_closures(&kernel_split_state.sd[ray_index]);
+#endif /* __BRANCHED_PATH__ */
+
kernel_split_state.rng[ray_index] = rng;
}
}
diff --git a/intern/cycles/kernel/split/kernel_shader_setup.h b/intern/cycles/kernel/split/kernel_shader_setup.h
new file mode 100644
index 00000000000..0432689d9fa
--- /dev/null
+++ b/intern/cycles/kernel/split/kernel_shader_setup.h
@@ -0,0 +1,70 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+/* This kernel sets up the ShaderData structure from the values computed
+ * by the previous kernels.
+ *
+ * It also identifies the rays of state RAY_TO_REGENERATE and enqueues them
+ * in QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue.
+ */
+ccl_device void kernel_shader_setup(KernelGlobals *kg,
+ ccl_local_param unsigned int *local_queue_atomics)
+{
+ /* Enqeueue RAY_TO_REGENERATE rays into QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue. */
+ if(ccl_local_id(0) == 0 && ccl_local_id(1) == 0) {
+ *local_queue_atomics = 0;
+ }
+ ccl_barrier(CCL_LOCAL_MEM_FENCE);
+
+ int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
+ int queue_index = kernel_split_params.queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS];
+ if(ray_index >= queue_index) {
+ return;
+ }
+ ray_index = get_ray_index(kg, ray_index,
+ QUEUE_ACTIVE_AND_REGENERATED_RAYS,
+ kernel_split_state.queue_data,
+ kernel_split_params.queue_size,
+ 0);
+
+ if(ray_index == QUEUE_EMPTY_SLOT) {
+ return;
+ }
+
+ char enqueue_flag = (IS_STATE(kernel_split_state.ray_state, ray_index, RAY_TO_REGENERATE)) ? 1 : 0;
+ enqueue_ray_index_local(ray_index,
+ QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
+ enqueue_flag,
+ kernel_split_params.queue_size,
+ local_queue_atomics,
+ kernel_split_state.queue_data,
+ kernel_split_params.queue_index);
+
+ /* Continue on with shader evaluation. */
+ if(IS_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE)) {
+ Intersection isect = kernel_split_state.isect[ray_index];
+ Ray ray = kernel_split_state.ray[ray_index];
+
+ shader_setup_from_ray(kg,
+ &kernel_split_state.sd[ray_index],
+ &isect,
+ &ray);
+ }
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_shader_sort.h b/intern/cycles/kernel/split/kernel_shader_sort.h
new file mode 100644
index 00000000000..297decb0bc2
--- /dev/null
+++ b/intern/cycles/kernel/split/kernel_shader_sort.h
@@ -0,0 +1,97 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+
+ccl_device void kernel_shader_sort(KernelGlobals *kg,
+ ccl_local_param ShaderSortLocals *locals)
+{
+#ifndef __KERNEL_CUDA__
+ int tid = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
+ uint qsize = kernel_split_params.queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS];
+ if(tid == 0) {
+ kernel_split_params.queue_index[QUEUE_SHADER_SORTED_RAYS] = qsize;
+ }
+
+ uint offset = (tid/SHADER_SORT_LOCAL_SIZE)*SHADER_SORT_BLOCK_SIZE;
+ if(offset >= qsize) {
+ return;
+ }
+
+ int lid = ccl_local_id(1) * ccl_local_size(0) + ccl_local_id(0);
+ uint input = QUEUE_ACTIVE_AND_REGENERATED_RAYS * (kernel_split_params.queue_size);
+ uint output = QUEUE_SHADER_SORTED_RAYS * (kernel_split_params.queue_size);
+ ccl_local uint *local_value = &locals->local_value[0];
+ ccl_local ushort *local_index = &locals->local_index[0];
+
+ /* copy to local memory */
+ for (uint i = 0; i < SHADER_SORT_BLOCK_SIZE; i += SHADER_SORT_LOCAL_SIZE) {
+ uint idx = offset + i + lid;
+ uint add = input + idx;
+ uint value = (~0);
+ if(idx < qsize) {
+ int ray_index = kernel_split_state.queue_data[add];
+ bool valid = (ray_index != QUEUE_EMPTY_SLOT) && IS_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE);
+ if(valid) {
+ value = kernel_split_state.sd[ray_index].shader & SHADER_MASK;
+ }
+ }
+ local_value[i + lid] = value;
+ local_index[i + lid] = i + lid;
+ }
+ ccl_barrier(CCL_LOCAL_MEM_FENCE);
+
+ /* skip sorting for cpu split kernel */
+# ifdef __KERNEL_OPENCL__
+
+ /* bitonic sort */
+ for (uint length = 1; length < SHADER_SORT_BLOCK_SIZE; length <<= 1) {
+ for (uint inc = length; inc > 0; inc >>= 1) {
+ for (uint ii = 0; ii < SHADER_SORT_BLOCK_SIZE; ii += SHADER_SORT_LOCAL_SIZE) {
+ uint i = lid + ii;
+ bool direction = ((i & (length << 1)) != 0);
+ uint j = i ^ inc;
+ ushort ioff = local_index[i];
+ ushort joff = local_index[j];
+ uint iKey = local_value[ioff];
+ uint jKey = local_value[joff];
+ bool smaller = (jKey < iKey) || (jKey == iKey && j < i);
+ bool swap = smaller ^ (j < i) ^ direction;
+ ccl_barrier(CCL_LOCAL_MEM_FENCE);
+ local_index[i] = (swap) ? joff : ioff;
+ local_index[j] = (swap) ? ioff : joff;
+ ccl_barrier(CCL_LOCAL_MEM_FENCE);
+ }
+ }
+ }
+# endif /* __KERNEL_OPENCL__ */
+
+ /* copy to destination */
+ for (uint i = 0; i < SHADER_SORT_BLOCK_SIZE; i += SHADER_SORT_LOCAL_SIZE) {
+ uint idx = offset + i + lid;
+ uint lidx = local_index[i + lid];
+ uint outi = output + idx;
+ uint ini = input + offset + lidx;
+ uint value = local_value[lidx];
+ if(idx < qsize) {
+ kernel_split_state.queue_data[outi] = (value == (~0)) ? QUEUE_EMPTY_SLOT : kernel_split_state.queue_data[ini];
+ }
+ }
+#endif /* __KERNEL_CUDA__ */
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_shadow_blocked_ao.h b/intern/cycles/kernel/split/kernel_shadow_blocked_ao.h
index 4243e18de72..474286285a9 100644
--- a/intern/cycles/kernel/split/kernel_shadow_blocked_ao.h
+++ b/intern/cycles/kernel/split/kernel_shadow_blocked_ao.h
@@ -29,31 +29,29 @@ ccl_device void kernel_shadow_blocked_ao(KernelGlobals *kg)
kernel_split_state.queue_data, kernel_split_params.queue_size, 1);
}
- if(ray_index == QUEUE_EMPTY_SLOT)
+ if(ray_index == QUEUE_EMPTY_SLOT) {
return;
+ }
- /* Flag determining if we need to update L. */
- char update_path_radiance = 0;
-
- if(IS_FLAG(kernel_split_state.ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO)) {
- ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
- ccl_global Ray *light_ray_global = &kernel_split_state.ao_light_ray[ray_index];
-
- float3 shadow;
- Ray ray = *light_ray_global;
- update_path_radiance = !(shadow_blocked(kg,
- &kernel_split_state.sd_DL_shadow[ray_index],
- state,
- &ray,
- &shadow));
-
- *light_ray_global = ray;
- /* We use light_ray_global's P and t to store shadow and
- * update_path_radiance.
- */
- light_ray_global->P = shadow;
- light_ray_global->t = update_path_radiance;
+ ShaderData *sd = &kernel_split_state.sd[ray_index];
+ ShaderData *emission_sd = &kernel_split_state.sd_DL_shadow[ray_index];
+ PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
+ ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
+ RNG rng = kernel_split_state.rng[ray_index];
+ float3 throughput = kernel_split_state.throughput[ray_index];
+
+#ifdef __BRANCHED_PATH__
+ if(!kernel_data.integrator.branched || IS_FLAG(kernel_split_state.ray_state, ray_index, RAY_BRANCHED_INDIRECT)) {
+#endif
+ kernel_path_ao(kg, sd, emission_sd, L, state, &rng, throughput, shader_bsdf_alpha(kg, sd));
+#ifdef __BRANCHED_PATH__
+ }
+ else {
+ kernel_branched_path_ao(kg, sd, emission_sd, L, state, &rng, throughput);
}
+#endif
+
+ kernel_split_state.rng[ray_index] = rng;
}
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_shadow_blocked_dl.h b/intern/cycles/kernel/split/kernel_shadow_blocked_dl.h
index bb8f0157965..78e61709b01 100644
--- a/intern/cycles/kernel/split/kernel_shadow_blocked_dl.h
+++ b/intern/cycles/kernel/split/kernel_shadow_blocked_dl.h
@@ -29,31 +29,82 @@ ccl_device void kernel_shadow_blocked_dl(KernelGlobals *kg)
kernel_split_state.queue_data, kernel_split_params.queue_size, 1);
}
+#ifdef __BRANCHED_PATH__
+ /* TODO(mai): move this somewhere else? */
+ if(thread_index == 0) {
+ /* Clear QUEUE_INACTIVE_RAYS before next kernel. */
+ kernel_split_params.queue_index[QUEUE_INACTIVE_RAYS] = 0;
+ }
+#endif /* __BRANCHED_PATH__ */
+
if(ray_index == QUEUE_EMPTY_SLOT)
return;
- /* Flag determining if we need to update L. */
- char update_path_radiance = 0;
+ ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
+ Ray ray = kernel_split_state.light_ray[ray_index];
+ PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
+ ShaderData *sd = &kernel_split_state.sd[ray_index];
+ float3 throughput = kernel_split_state.throughput[ray_index];
+ RNG rng = kernel_split_state.rng[ray_index];
+
+ BsdfEval L_light = kernel_split_state.bsdf_eval[ray_index];
+ ShaderData *emission_sd = &kernel_split_state.sd_DL_shadow[ray_index];
+ bool is_lamp = kernel_split_state.is_lamp[ray_index];
+
+# if defined(__BRANCHED_PATH__) || defined(__SHADOW_TRICKS__)
+ bool use_branched = false;
+ int all = 0;
+
+ if(state->flag & PATH_RAY_SHADOW_CATCHER) {
+ use_branched = true;
+ all = 1;
+ }
+# if defined(__BRANCHED_PATH__)
+ else if(kernel_data.integrator.branched) {
+ use_branched = true;
- if(IS_FLAG(kernel_split_state.ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL)) {
- ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
- ccl_global Ray *light_ray_global = &kernel_split_state.light_ray[ray_index];
+ if(IS_FLAG(kernel_split_state.ray_state, ray_index, RAY_BRANCHED_INDIRECT)) {
+ all = (kernel_data.integrator.sample_all_lights_indirect);
+ }
+ else
+ {
+ all = (kernel_data.integrator.sample_all_lights_direct);
+ }
+ }
+# endif /* __BRANCHED_PATH__ */
+ if(use_branched) {
+ kernel_branched_path_surface_connect_light(kg,
+ &rng,
+ sd,
+ emission_sd,
+ state,
+ throughput,
+ 1.0f,
+ L,
+ all);
+ }
+ else
+# endif /* defined(__BRANCHED_PATH__) || defined(__SHADOW_TRICKS__)*/
+ {
+ /* trace shadow ray */
float3 shadow;
- Ray ray = *light_ray_global;
- update_path_radiance = !(shadow_blocked(kg,
- &kernel_split_state.sd_DL_shadow[ray_index],
- state,
- &ray,
- &shadow));
-
- *light_ray_global = ray;
- /* We use light_ray_global's P and t to store shadow and
- * update_path_radiance.
- */
- light_ray_global->P = shadow;
- light_ray_global->t = update_path_radiance;
+
+ if(!shadow_blocked(kg,
+ emission_sd,
+ state,
+ &ray,
+ &shadow))
+ {
+ /* accumulate */
+ path_radiance_accum_light(L, state, throughput, &L_light, shadow, 1.0f, is_lamp);
+ }
+ else {
+ path_radiance_accum_total_light(L, state, throughput, &L_light);
+ }
}
+
+ kernel_split_state.rng[ray_index] = rng;
}
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_split_common.h b/intern/cycles/kernel/split/kernel_split_common.h
index 4303ba0a905..08f0124b529 100644
--- a/intern/cycles/kernel/split/kernel_split_common.h
+++ b/intern/cycles/kernel/split/kernel_split_common.h
@@ -37,41 +37,55 @@
#include "util/util_atomic.h"
-#include "kernel/kernel_random.h"
-#include "kernel/kernel_projection.h"
-#include "kernel/kernel_montecarlo.h"
-#include "kernel/kernel_differential.h"
-#include "kernel/kernel_camera.h"
-
-#include "kernel/geom/geom.h"
-#include "kernel/bvh/bvh.h"
-
-#include "kernel/kernel_accumulate.h"
-#include "kernel/kernel_shader.h"
-#include "kernel/kernel_light.h"
-#include "kernel/kernel_passes.h"
-
-#ifdef __SUBSURFACE__
-# include "kernel/kernel_subsurface.h"
+#include "kernel/kernel_path.h"
+#ifdef __BRANCHED_PATH__
+# include "kernel/kernel_path_branched.h"
#endif
-#ifdef __VOLUME__
-# include "kernel/kernel_volume.h"
-#endif
+#include "kernel/kernel_queues.h"
+#include "kernel/kernel_work_stealing.h"
-#include "kernel/kernel_path_state.h"
-#include "kernel/kernel_shadow.h"
-#include "kernel/kernel_emission.h"
-#include "kernel/kernel_path_common.h"
-#include "kernel/kernel_path_surface.h"
-#include "kernel/kernel_path_volume.h"
-#include "kernel/kernel_path_subsurface.h"
+#ifdef __BRANCHED_PATH__
+# include "kernel/split/kernel_branched.h"
+#endif
-#ifdef __KERNEL_DEBUG__
-# include "kernel/kernel_debug.h"
+CCL_NAMESPACE_BEGIN
+
+ccl_device_inline void kernel_split_path_end(KernelGlobals *kg, int ray_index)
+{
+ ccl_global char *ray_state = kernel_split_state.ray_state;
+
+#ifdef __BRANCHED_PATH__
+ if(IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT_SHARED)) {
+ int orig_ray = kernel_split_state.branched_state[ray_index].original_ray;
+
+ PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
+ PathRadiance *orig_ray_L = &kernel_split_state.path_radiance[orig_ray];
+
+ path_radiance_sum_indirect(L);
+ path_radiance_accum_sample(orig_ray_L, L, 1);
+
+ atomic_fetch_and_dec_uint32((ccl_global uint*)&kernel_split_state.branched_state[orig_ray].shared_sample_count);
+
+ ASSIGN_RAY_STATE(ray_state, ray_index, RAY_INACTIVE);
+ }
+ else if(IS_FLAG(ray_state, ray_index, RAY_BRANCHED_LIGHT_INDIRECT)) {
+ ASSIGN_RAY_STATE(ray_state, ray_index, RAY_LIGHT_INDIRECT_NEXT_ITER);
+ }
+ else if(IS_FLAG(ray_state, ray_index, RAY_BRANCHED_VOLUME_INDIRECT)) {
+ ASSIGN_RAY_STATE(ray_state, ray_index, RAY_VOLUME_INDIRECT_NEXT_ITER);
+ }
+ else if(IS_FLAG(ray_state, ray_index, RAY_BRANCHED_SUBSURFACE_INDIRECT)) {
+ ASSIGN_RAY_STATE(ray_state, ray_index, RAY_SUBSURFACE_INDIRECT_NEXT_ITER);
+ }
+ else {
+ ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
+ }
+#else
+ ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
#endif
+}
-#include "kernel/kernel_queues.h"
-#include "kernel/kernel_work_stealing.h"
+CCL_NAMESPACE_END
#endif /* __KERNEL_SPLIT_H__ */
diff --git a/intern/cycles/kernel/split/kernel_split_data.h b/intern/cycles/kernel/split/kernel_split_data.h
index 17e6587883a..eac22050a38 100644
--- a/intern/cycles/kernel/split/kernel_split_data.h
+++ b/intern/cycles/kernel/split/kernel_split_data.h
@@ -31,14 +31,6 @@ ccl_device_inline uint64_t split_data_buffer_size(KernelGlobals *kg, size_t num_
size = size SPLIT_DATA_ENTRIES;
#undef SPLIT_DATA_ENTRY
-#ifdef __SUBSURFACE__
- size += align_up(num_elements * sizeof(SubsurfaceIndirectRays), 16); /* ss_rays */
-#endif
-
-#ifdef __VOLUME__
- size += align_up(2 * num_elements * sizeof(PathState), 16); /* state_shadow */
-#endif
-
return size;
}
@@ -57,16 +49,6 @@ ccl_device_inline void split_data_init(KernelGlobals *kg,
SPLIT_DATA_ENTRIES;
#undef SPLIT_DATA_ENTRY
-#ifdef __SUBSURFACE__
- split_data->ss_rays = (ccl_global SubsurfaceIndirectRays*)p;
- p += align_up(num_elements * sizeof(SubsurfaceIndirectRays), 16);
-#endif
-
-#ifdef __VOLUME__
- split_data->state_shadow = (ccl_global PathState*)p;
- p += align_up(2 * num_elements * sizeof(PathState), 16);
-#endif
-
split_data->ray_state = ray_state;
}
diff --git a/intern/cycles/kernel/split/kernel_split_data_types.h b/intern/cycles/kernel/split/kernel_split_data_types.h
index 748197b7183..4bb2f0d3d80 100644
--- a/intern/cycles/kernel/split/kernel_split_data_types.h
+++ b/intern/cycles/kernel/split/kernel_split_data_types.h
@@ -43,6 +43,9 @@ typedef struct SplitParams {
ccl_global char *use_queues_flag;
ccl_global float *buffer;
+
+ /* Place for storing sd->flag. AMD GPU OpenCL compiler workaround */
+ int dummy_sd_flag;
} SplitParams;
/* Global memory variables [porting]; These memory is used for
@@ -59,7 +62,64 @@ typedef struct SplitParams {
SPLIT_DATA_ENTRY(DebugData, debug_data, 1)
#else
# define SPLIT_DATA_DEBUG_ENTRIES
-#endif
+#endif /* DEBUG */
+
+#ifdef __BRANCHED_PATH__
+
+typedef ccl_global struct SplitBranchedState {
+ /* various state that must be kept and restored after an indirect loop */
+ PathState path_state;
+ float3 throughput;
+ Ray ray;
+
+ struct ShaderData sd;
+ Intersection isect;
+
+ char ray_state;
+
+ /* indirect loop state */
+ int next_closure;
+ int next_sample;
+ int num_samples;
+
+#ifdef __SUBSURFACE__
+ int ss_next_closure;
+ int ss_next_sample;
+ int next_hit;
+ int num_hits;
+
+ uint lcg_state;
+ SubsurfaceIntersection ss_isect;
+
+# ifdef __VOLUME__
+ VolumeStack volume_stack[VOLUME_STACK_SIZE];
+# endif /* __VOLUME__ */
+#endif /*__SUBSURFACE__ */
+
+ int shared_sample_count; /* number of branched samples shared with other threads */
+ int original_ray; /* index of original ray when sharing branched samples */
+ bool waiting_on_shared_samples;
+} SplitBranchedState;
+
+#define SPLIT_DATA_BRANCHED_ENTRIES \
+ SPLIT_DATA_ENTRY( SplitBranchedState, branched_state, 1)
+#else
+#define SPLIT_DATA_BRANCHED_ENTRIES
+#endif /* __BRANCHED_PATH__ */
+
+#ifdef __SUBSURFACE__
+# define SPLIT_DATA_SUBSURFACE_ENTRIES \
+ SPLIT_DATA_ENTRY(ccl_global SubsurfaceIndirectRays, ss_rays, 1)
+#else
+# define SPLIT_DATA_SUBSURFACE_ENTRIES
+#endif /* __SUBSURFACE__ */
+
+#ifdef __VOLUME__
+# define SPLIT_DATA_VOLUME_ENTRIES \
+ SPLIT_DATA_ENTRY(ccl_global PathState, state_shadow, 1)
+#else
+# define SPLIT_DATA_VOLUME_ENTRIES
+#endif /* __VOLUME__ */
#define SPLIT_DATA_ENTRIES \
SPLIT_DATA_ENTRY(ccl_global RNG, rng, 1) \
@@ -69,9 +129,6 @@ typedef struct SplitParams {
SPLIT_DATA_ENTRY(ccl_global Ray, ray, 1) \
SPLIT_DATA_ENTRY(ccl_global PathState, path_state, 1) \
SPLIT_DATA_ENTRY(ccl_global Intersection, isect, 1) \
- SPLIT_DATA_ENTRY(ccl_global float3, ao_alpha, 1) \
- SPLIT_DATA_ENTRY(ccl_global float3, ao_bsdf, 1) \
- SPLIT_DATA_ENTRY(ccl_global Ray, ao_light_ray, 1) \
SPLIT_DATA_ENTRY(ccl_global BsdfEval, bsdf_eval, 1) \
SPLIT_DATA_ENTRY(ccl_global int, is_lamp, 1) \
SPLIT_DATA_ENTRY(ccl_global Ray, light_ray, 1) \
@@ -79,6 +136,28 @@ typedef struct SplitParams {
SPLIT_DATA_ENTRY(ccl_global uint, work_array, 1) \
SPLIT_DATA_ENTRY(ShaderData, sd, 1) \
SPLIT_DATA_ENTRY(ShaderData, sd_DL_shadow, 1) \
+ SPLIT_DATA_SUBSURFACE_ENTRIES \
+ SPLIT_DATA_VOLUME_ENTRIES \
+ SPLIT_DATA_BRANCHED_ENTRIES \
+ SPLIT_DATA_DEBUG_ENTRIES \
+
+/* entries to be copied to inactive rays when sharing branched samples (TODO: which are actually needed?) */
+#define SPLIT_DATA_ENTRIES_BRANCHED_SHARED \
+ SPLIT_DATA_ENTRY(ccl_global RNG, rng, 1) \
+ SPLIT_DATA_ENTRY(ccl_global float3, throughput, 1) \
+ SPLIT_DATA_ENTRY(ccl_global float, L_transparent, 1) \
+ SPLIT_DATA_ENTRY(PathRadiance, path_radiance, 1) \
+ SPLIT_DATA_ENTRY(ccl_global Ray, ray, 1) \
+ SPLIT_DATA_ENTRY(ccl_global PathState, path_state, 1) \
+ SPLIT_DATA_ENTRY(ccl_global Intersection, isect, 1) \
+ SPLIT_DATA_ENTRY(ccl_global BsdfEval, bsdf_eval, 1) \
+ SPLIT_DATA_ENTRY(ccl_global int, is_lamp, 1) \
+ SPLIT_DATA_ENTRY(ccl_global Ray, light_ray, 1) \
+ SPLIT_DATA_ENTRY(ShaderData, sd, 1) \
+ SPLIT_DATA_ENTRY(ShaderData, sd_DL_shadow, 1) \
+ SPLIT_DATA_SUBSURFACE_ENTRIES \
+ SPLIT_DATA_VOLUME_ENTRIES \
+ SPLIT_DATA_BRANCHED_ENTRIES \
SPLIT_DATA_DEBUG_ENTRIES \
/* struct that holds pointers to data in the shared state buffer */
@@ -87,14 +166,6 @@ typedef struct SplitData {
SPLIT_DATA_ENTRIES
#undef SPLIT_DATA_ENTRY
-#ifdef __SUBSURFACE__
- ccl_global SubsurfaceIndirectRays *ss_rays;
-#endif
-
-#ifdef __VOLUME__
- ccl_global PathState *state_shadow;
-#endif
-
/* this is actually in a separate buffer from the rest of the split state data (so it can be read back from
* the host easily) but is still used the same as the other data so we have it here in this struct as well
*/
@@ -122,6 +193,11 @@ typedef struct BackgroundAOLocals {
uint queue_atomics_ao;
} BackgroundAOLocals;
+typedef struct ShaderSortLocals {
+ uint local_value[SHADER_SORT_BLOCK_SIZE];
+ ushort local_index[SHADER_SORT_BLOCK_SIZE];
+} ShaderSortLocals;
+
CCL_NAMESPACE_END
#endif /* __KERNEL_SPLIT_DATA_TYPES_H__ */
diff --git a/intern/cycles/kernel/split/kernel_subsurface_scatter.h b/intern/cycles/kernel/split/kernel_subsurface_scatter.h
index 0b4d50c70ee..d5083b23f80 100644
--- a/intern/cycles/kernel/split/kernel_subsurface_scatter.h
+++ b/intern/cycles/kernel/split/kernel_subsurface_scatter.h
@@ -16,82 +16,306 @@
CCL_NAMESPACE_BEGIN
+#if defined(__BRANCHED_PATH__) && defined(__SUBSURFACE__)
-ccl_device void kernel_subsurface_scatter(KernelGlobals *kg,
- ccl_local_param unsigned int* local_queue_atomics)
+ccl_device_inline void kernel_split_branched_path_subsurface_indirect_light_init(KernelGlobals *kg, int ray_index)
{
-#ifdef __SUBSURFACE__
- if(ccl_local_id(0) == 0 && ccl_local_id(1) == 0) {
- *local_queue_atomics = 0;
+ kernel_split_branched_path_indirect_loop_init(kg, ray_index);
+
+ SplitBranchedState *branched_state = &kernel_split_state.branched_state[ray_index];
+
+ branched_state->ss_next_closure = 0;
+ branched_state->ss_next_sample = 0;
+
+ branched_state->num_hits = 0;
+ branched_state->next_hit = 0;
+
+ ADD_RAY_FLAG(kernel_split_state.ray_state, ray_index, RAY_BRANCHED_SUBSURFACE_INDIRECT);
+}
+
+ccl_device_noinline bool kernel_split_branched_path_subsurface_indirect_light_iter(KernelGlobals *kg, int ray_index)
+{
+ SplitBranchedState *branched_state = &kernel_split_state.branched_state[ray_index];
+
+ ShaderData *sd = &branched_state->sd;
+ RNG rng = kernel_split_state.rng[ray_index];
+ PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
+ ShaderData *emission_sd = &kernel_split_state.sd_DL_shadow[ray_index];
+
+ for(int i = branched_state->ss_next_closure; i < sd->num_closure; i++) {
+ ShaderClosure *sc = &sd->closure[i];
+
+ if(!CLOSURE_IS_BSSRDF(sc->type))
+ continue;
+
+ /* set up random number generator */
+ if(branched_state->ss_next_sample == 0 && branched_state->next_hit == 0 &&
+ branched_state->next_closure == 0 && branched_state->next_sample == 0)
+ {
+ branched_state->lcg_state = lcg_state_init(&rng,
+ branched_state->path_state.rng_offset,
+ branched_state->path_state.sample,
+ 0x68bc21eb);
+ }
+ int num_samples = kernel_data.integrator.subsurface_samples;
+ float num_samples_inv = 1.0f/num_samples;
+ RNG bssrdf_rng = cmj_hash(rng, i);
+
+ /* do subsurface scatter step with copy of shader data, this will
+ * replace the BSSRDF with a diffuse BSDF closure */
+ for(int j = branched_state->ss_next_sample; j < num_samples; j++) {
+ ccl_global SubsurfaceIntersection *ss_isect = &branched_state->ss_isect;
+ float bssrdf_u, bssrdf_v;
+ path_branched_rng_2D(kg,
+ &bssrdf_rng,
+ &branched_state->path_state,
+ j,
+ num_samples,
+ PRNG_BSDF_U,
+ &bssrdf_u,
+ &bssrdf_v);
+
+ /* intersection is expensive so avoid doing multiple times for the same input */
+ if(branched_state->next_hit == 0 && branched_state->next_closure == 0 && branched_state->next_sample == 0) {
+ RNG lcg_state = branched_state->lcg_state;
+ SubsurfaceIntersection ss_isect_private;
+
+ branched_state->num_hits = subsurface_scatter_multi_intersect(kg,
+ &ss_isect_private,
+ sd,
+ sc,
+ &lcg_state,
+ bssrdf_u, bssrdf_v,
+ true);
+
+ branched_state->lcg_state = lcg_state;
+ *ss_isect = ss_isect_private;
+ }
+
+#ifdef __VOLUME__
+ Ray volume_ray = branched_state->ray;
+ bool need_update_volume_stack =
+ kernel_data.integrator.use_volumes &&
+ sd->object_flag & SD_OBJECT_INTERSECTS_VOLUME;
+#endif /* __VOLUME__ */
+
+ /* compute lighting with the BSDF closure */
+ for(int hit = branched_state->next_hit; hit < branched_state->num_hits; hit++) {
+ ShaderData *bssrdf_sd = &kernel_split_state.sd[ray_index];
+ *bssrdf_sd = *sd; /* note: copy happens each iteration of inner loop, this is
+ * important as the indirect path will write into bssrdf_sd */
+
+ SubsurfaceIntersection ss_isect_private = *ss_isect;
+ subsurface_scatter_multi_setup(kg,
+ &ss_isect_private,
+ hit,
+ bssrdf_sd,
+ &branched_state->path_state,
+ branched_state->path_state.flag,
+ sc,
+ true);
+ *ss_isect = ss_isect_private;
+
+ ccl_global PathState *hit_state = &kernel_split_state.path_state[ray_index];
+ *hit_state = branched_state->path_state;
+
+ path_state_branch(hit_state, j, num_samples);
+
+#ifdef __VOLUME__
+ if(need_update_volume_stack) {
+ /* Setup ray from previous surface point to the new one. */
+ float3 P = ray_offset(bssrdf_sd->P, -bssrdf_sd->Ng);
+ volume_ray.D = normalize_len(P - volume_ray.P, &volume_ray.t);
+
+ /* this next part is expensive as it does scene intersection so only do once */
+ if(branched_state->next_closure == 0 && branched_state->next_sample == 0) {
+ for(int k = 0; k < VOLUME_STACK_SIZE; k++) {
+ branched_state->volume_stack[k] = hit_state->volume_stack[k];
+ }
+
+ kernel_volume_stack_update_for_subsurface(kg,
+ emission_sd,
+ &volume_ray,
+ branched_state->volume_stack);
+ }
+
+ for(int k = 0; k < VOLUME_STACK_SIZE; k++) {
+ hit_state->volume_stack[k] = branched_state->volume_stack[k];
+ }
+ }
+#endif /* __VOLUME__ */
+
+#ifdef __EMISSION__
+ if(branched_state->next_closure == 0 && branched_state->next_sample == 0) {
+ /* direct light */
+ if(kernel_data.integrator.use_direct_light) {
+ int all = (kernel_data.integrator.sample_all_lights_direct) ||
+ (branched_state->path_state.flag & PATH_RAY_SHADOW_CATCHER);
+ kernel_branched_path_surface_connect_light(kg,
+ &rng,
+ bssrdf_sd,
+ emission_sd,
+ hit_state,
+ branched_state->throughput,
+ num_samples_inv,
+ L,
+ all);
+ }
+ }
+#endif /* __EMISSION__ */
+
+ /* indirect light */
+ if(kernel_split_branched_path_surface_indirect_light_iter(kg,
+ ray_index,
+ num_samples_inv,
+ bssrdf_sd,
+ false,
+ false))
+ {
+ branched_state->ss_next_closure = i;
+ branched_state->ss_next_sample = j;
+ branched_state->next_hit = hit;
+
+ return true;
+ }
+
+ branched_state->next_closure = 0;
+ }
+
+ branched_state->next_hit = 0;
+ }
+
+ branched_state->ss_next_sample = 0;
+ }
+
+ branched_state->ss_next_closure = sd->num_closure;
+
+ branched_state->waiting_on_shared_samples = (branched_state->shared_sample_count > 0);
+ if(branched_state->waiting_on_shared_samples) {
+ return true;
+ }
+
+ kernel_split_branched_path_indirect_loop_end(kg, ray_index);
+
+ return false;
+}
+
+#endif /* __BRANCHED_PATH__ && __SUBSURFACE__ */
+
+ccl_device void kernel_subsurface_scatter(KernelGlobals *kg)
+{
+ int thread_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
+ if(thread_index == 0) {
+ /* We will empty both queues in this kernel. */
+ kernel_split_params.queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS] = 0;
+ kernel_split_params.queue_index[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS] = 0;
}
- ccl_barrier(CCL_LOCAL_MEM_FENCE);
int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
ray_index = get_ray_index(kg, ray_index,
QUEUE_ACTIVE_AND_REGENERATED_RAYS,
kernel_split_state.queue_data,
kernel_split_params.queue_size,
- 0);
-
-#ifdef __COMPUTE_DEVICE_GPU__
- /* If we are executing on a GPU device, we exit all threads that are not
- * required.
- *
- * If we are executing on a CPU device, then we need to keep all threads
- * active since we have barrier() calls later in the kernel. CPU devices,
- * expect all threads to execute barrier statement.
- */
- if(ray_index == QUEUE_EMPTY_SLOT) {
- return;
- }
-#endif
-
- char enqueue_flag = 0;
-
-#ifndef __COMPUTE_DEVICE_GPU__
- if(ray_index != QUEUE_EMPTY_SLOT) {
-#endif
+ 1);
+ get_ray_index(kg, thread_index,
+ QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
+ kernel_split_state.queue_data,
+ kernel_split_params.queue_size,
+ 1);
+#ifdef __SUBSURFACE__
ccl_global char *ray_state = kernel_split_state.ray_state;
- ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
- PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
- RNG rng = kernel_split_state.rng[ray_index];
- ccl_global Ray *ray = &kernel_split_state.ray[ray_index];
- ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index];
- ccl_global SubsurfaceIndirectRays *ss_indirect = &kernel_split_state.ss_rays[ray_index];
- ShaderData *sd = &kernel_split_state.sd[ray_index];
- ShaderData *emission_sd = &kernel_split_state.sd_DL_shadow[ray_index];
if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
+ ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
+ PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
+ RNG rng = kernel_split_state.rng[ray_index];
+ ccl_global Ray *ray = &kernel_split_state.ray[ray_index];
+ ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index];
+ ccl_global SubsurfaceIndirectRays *ss_indirect = &kernel_split_state.ss_rays[ray_index];
+ ShaderData *sd = &kernel_split_state.sd[ray_index];
+ ShaderData *emission_sd = &kernel_split_state.sd_DL_shadow[ray_index];
+
if(sd->flag & SD_BSSRDF) {
- if(kernel_path_subsurface_scatter(kg,
- sd,
- emission_sd,
- L,
- state,
- &rng,
- ray,
- throughput,
- ss_indirect)) {
- ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
- enqueue_flag = 1;
+
+#ifdef __BRANCHED_PATH__
+ if(!kernel_data.integrator.branched) {
+#endif
+ if(kernel_path_subsurface_scatter(kg,
+ sd,
+ emission_sd,
+ L,
+ state,
+ &rng,
+ ray,
+ throughput,
+ ss_indirect))
+ {
+ kernel_split_path_end(kg, ray_index);
+ }
+#ifdef __BRANCHED_PATH__
+ }
+ else if(IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT)) {
+ float bssrdf_probability;
+ ShaderClosure *sc = subsurface_scatter_pick_closure(kg, sd, &bssrdf_probability);
+
+ /* modify throughput for picking bssrdf or bsdf */
+ *throughput *= bssrdf_probability;
+
+ /* do bssrdf scatter step if we picked a bssrdf closure */
+ if(sc) {
+ uint lcg_state = lcg_state_init(&rng, state->rng_offset, state->sample, 0x68bc21eb);
+ float bssrdf_u, bssrdf_v;
+ path_state_rng_2D(kg,
+ &rng,
+ state,
+ PRNG_BSDF_U,
+ &bssrdf_u, &bssrdf_v);
+ subsurface_scatter_step(kg,
+ sd,
+ state,
+ state->flag,
+ sc,
+ &lcg_state,
+ bssrdf_u, bssrdf_v,
+ false);
+ }
+ }
+ else {
+ kernel_split_branched_path_subsurface_indirect_light_init(kg, ray_index);
+
+ if(kernel_split_branched_path_subsurface_indirect_light_iter(kg, ray_index)) {
+ ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED);
+ }
}
+#endif
}
kernel_split_state.rng[ray_index] = rng;
}
-#ifndef __COMPUTE_DEVICE_GPU__
+# ifdef __BRANCHED_PATH__
+ if(ccl_global_id(0) == 0 && ccl_global_id(1) == 0) {
+ kernel_split_params.queue_index[QUEUE_SUBSURFACE_INDIRECT_ITER] = 0;
}
-#endif
- /* Enqueue RAY_UPDATE_BUFFER rays. */
- enqueue_ray_index_local(ray_index,
- QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
- enqueue_flag,
- kernel_split_params.queue_size,
- local_queue_atomics,
- kernel_split_state.queue_data,
- kernel_split_params.queue_index);
+ /* iter loop */
+ ray_index = get_ray_index(kg, ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0),
+ QUEUE_SUBSURFACE_INDIRECT_ITER,
+ kernel_split_state.queue_data,
+ kernel_split_params.queue_size,
+ 1);
+
+ if(IS_STATE(ray_state, ray_index, RAY_SUBSURFACE_INDIRECT_NEXT_ITER)) {
+ /* for render passes, sum and reset indirect light pass variables
+ * for the next samples */
+ path_radiance_sum_indirect(&kernel_split_state.path_radiance[ray_index]);
+ path_radiance_reset_indirect(&kernel_split_state.path_radiance[ray_index]);
+
+ if(kernel_split_branched_path_subsurface_indirect_light_iter(kg, ray_index)) {
+ ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED);
+ }
+ }
+# endif /* __BRANCHED_PATH__ */
#endif /* __SUBSURFACE__ */
diff --git a/intern/cycles/kernel/svm/svm_closure.h b/intern/cycles/kernel/svm/svm_closure.h
index 1885e1af851..4268813b263 100644
--- a/intern/cycles/kernel/svm/svm_closure.h
+++ b/intern/cycles/kernel/svm/svm_closure.h
@@ -76,6 +76,345 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
float param2 = (stack_valid(param2_offset))? stack_load_float(stack, param2_offset): __uint_as_float(node.w);
switch(type) {
+#ifdef __PRINCIPLED__
+ case CLOSURE_BSDF_PRINCIPLED_ID: {
+ uint specular_offset, roughness_offset, specular_tint_offset, anisotropic_offset, sheen_offset,
+ sheen_tint_offset, clearcoat_offset, clearcoat_roughness_offset, eta_offset, transmission_offset,
+ anisotropic_rotation_offset, transmission_roughness_offset;
+ uint4 data_node2 = read_node(kg, offset);
+
+ float3 T = stack_load_float3(stack, data_node.y);
+ decode_node_uchar4(data_node.z, &specular_offset, &roughness_offset, &specular_tint_offset, &anisotropic_offset);
+ decode_node_uchar4(data_node.w, &sheen_offset, &sheen_tint_offset, &clearcoat_offset, &clearcoat_roughness_offset);
+ decode_node_uchar4(data_node2.x, &eta_offset, &transmission_offset, &anisotropic_rotation_offset, &transmission_roughness_offset);
+
+ // get Disney principled parameters
+ float metallic = param1;
+ float subsurface = param2;
+ float specular = stack_load_float(stack, specular_offset);
+ float roughness = stack_load_float(stack, roughness_offset);
+ float specular_tint = stack_load_float(stack, specular_tint_offset);
+ float anisotropic = stack_load_float(stack, anisotropic_offset);
+ float sheen = stack_load_float(stack, sheen_offset);
+ float sheen_tint = stack_load_float(stack, sheen_tint_offset);
+ float clearcoat = stack_load_float(stack, clearcoat_offset);
+ float clearcoat_roughness = stack_load_float(stack, clearcoat_roughness_offset);
+ float transmission = stack_load_float(stack, transmission_offset);
+ float anisotropic_rotation = stack_load_float(stack, anisotropic_rotation_offset);
+ float transmission_roughness = stack_load_float(stack, transmission_roughness_offset);
+ float eta = fmaxf(stack_load_float(stack, eta_offset), 1e-5f);
+
+ ClosureType distribution = stack_valid(data_node2.y) ? (ClosureType) data_node2.y : CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID;
+
+ /* rotate tangent */
+ if(anisotropic_rotation != 0.0f)
+ T = rotate_around_axis(T, N, anisotropic_rotation * M_2PI_F);
+
+ /* calculate ior */
+ float ior = (sd->flag & SD_BACKFACING) ? 1.0f / eta : eta;
+
+ // calculate fresnel for refraction
+ float cosNO = dot(N, sd->I);
+ float fresnel = fresnel_dielectric_cos(cosNO, ior);
+
+ // calculate weights of the diffuse and specular part
+ float diffuse_weight = (1.0f - saturate(metallic)) * (1.0f - saturate(transmission));
+
+ float final_transmission = saturate(transmission) * (1.0f - saturate(metallic));
+ float specular_weight = (1.0f - final_transmission);
+
+ // get the base color
+ uint4 data_base_color = read_node(kg, offset);
+ float3 base_color = stack_valid(data_base_color.x) ? stack_load_float3(stack, data_base_color.x) :
+ make_float3(__uint_as_float(data_base_color.y), __uint_as_float(data_base_color.z), __uint_as_float(data_base_color.w));
+
+ // get the additional clearcoat normal and subsurface scattering radius
+ uint4 data_cn_ssr = read_node(kg, offset);
+ float3 clearcoat_normal = stack_valid(data_cn_ssr.x) ? stack_load_float3(stack, data_cn_ssr.x) : sd->N;
+ float3 subsurface_radius = stack_valid(data_cn_ssr.y) ? stack_load_float3(stack, data_cn_ssr.y) : make_float3(1.0f, 1.0f, 1.0f);
+
+ // get the subsurface color
+ uint4 data_subsurface_color = read_node(kg, offset);
+ float3 subsurface_color = stack_valid(data_subsurface_color.x) ? stack_load_float3(stack, data_subsurface_color.x) :
+ make_float3(__uint_as_float(data_subsurface_color.y), __uint_as_float(data_subsurface_color.z), __uint_as_float(data_subsurface_color.w));
+
+ float3 weight = sd->svm_closure_weight * mix_weight;
+
+#ifdef __SUBSURFACE__
+ float3 mixed_ss_base_color = subsurface_color * subsurface + base_color * (1.0f - subsurface);
+ float3 subsurf_weight = weight * mixed_ss_base_color * diffuse_weight;
+ float subsurf_sample_weight = fabsf(average(subsurf_weight));
+
+ /* disable in case of diffuse ancestor, can't see it well then and
+ * adds considerably noise due to probabilities of continuing path
+ * getting lower and lower */
+ if(path_flag & PATH_RAY_DIFFUSE_ANCESTOR) {
+ subsurface = 0.0f;
+
+ /* need to set the base color in this case such that the
+ * rays get the correctly mixed color after transmitting
+ * the object */
+ base_color = mixed_ss_base_color;
+ }
+
+ /* diffuse */
+ if(fabsf(average(mixed_ss_base_color)) > CLOSURE_WEIGHT_CUTOFF) {
+ if(subsurface <= CLOSURE_WEIGHT_CUTOFF && diffuse_weight > CLOSURE_WEIGHT_CUTOFF) {
+ float3 diff_weight = weight * base_color * diffuse_weight;
+
+ PrincipledDiffuseBsdf *bsdf = (PrincipledDiffuseBsdf*)bsdf_alloc(sd, sizeof(PrincipledDiffuseBsdf), diff_weight);
+
+ if(bsdf) {
+ bsdf->N = N;
+ bsdf->roughness = roughness;
+
+ /* setup bsdf */
+ sd->flag |= bsdf_principled_diffuse_setup(bsdf);
+ }
+ }
+ else if(subsurface > CLOSURE_WEIGHT_CUTOFF && subsurf_sample_weight > CLOSURE_WEIGHT_CUTOFF) {
+ /* radius * scale */
+ float3 radius = subsurface_radius * subsurface;
+ /* sharpness */
+ float sharpness = 0.0f;
+ /* texture color blur */
+ float texture_blur = 0.0f;
+
+ /* create one closure per color channel */
+ Bssrdf *bssrdf = bssrdf_alloc(sd, make_float3(subsurf_weight.x, 0.0f, 0.0f));
+ if(bssrdf) {
+ bssrdf->sample_weight = subsurf_sample_weight;
+ bssrdf->radius = radius.x;
+ bssrdf->texture_blur = texture_blur;
+ bssrdf->albedo = subsurface_color.x;
+ bssrdf->sharpness = sharpness;
+ bssrdf->N = N;
+ bssrdf->roughness = roughness;
+
+ /* setup bsdf */
+ sd->flag |= bssrdf_setup(bssrdf, (ClosureType)CLOSURE_BSSRDF_PRINCIPLED_ID);
+ }
+
+ bssrdf = bssrdf_alloc(sd, make_float3(0.0f, subsurf_weight.y, 0.0f));
+ if(bssrdf) {
+ bssrdf->sample_weight = subsurf_sample_weight;
+ bssrdf->radius = radius.y;
+ bssrdf->texture_blur = texture_blur;
+ bssrdf->albedo = subsurface_color.y;
+ bssrdf->sharpness = sharpness;
+ bssrdf->N = N;
+ bssrdf->roughness = roughness;
+
+ /* setup bsdf */
+ sd->flag |= bssrdf_setup(bssrdf, (ClosureType)CLOSURE_BSSRDF_PRINCIPLED_ID);
+ }
+
+ bssrdf = bssrdf_alloc(sd, make_float3(0.0f, 0.0f, subsurf_weight.z));
+ if(bssrdf) {
+ bssrdf->sample_weight = subsurf_sample_weight;
+ bssrdf->radius = radius.z;
+ bssrdf->texture_blur = texture_blur;
+ bssrdf->albedo = subsurface_color.z;
+ bssrdf->sharpness = sharpness;
+ bssrdf->N = N;
+ bssrdf->roughness = roughness;
+
+ /* setup bsdf */
+ sd->flag |= bssrdf_setup(bssrdf, (ClosureType)CLOSURE_BSSRDF_PRINCIPLED_ID);
+ }
+ }
+ }
+#else
+ /* diffuse */
+ if(diffuse_weight > CLOSURE_WEIGHT_CUTOFF) {
+ float3 diff_weight = weight * base_color * diffuse_weight;
+
+ PrincipledDiffuseBsdf *bsdf = (PrincipledDiffuseBsdf*)bsdf_alloc(sd, sizeof(PrincipledDiffuseBsdf), diff_weight);
+
+ if(bsdf) {
+ bsdf->N = N;
+ bsdf->roughness = roughness;
+
+ /* setup bsdf */
+ sd->flag |= bsdf_principled_diffuse_setup(bsdf);
+ }
+ }
+#endif
+
+ /* sheen */
+ if(diffuse_weight > CLOSURE_WEIGHT_CUTOFF && sheen > CLOSURE_WEIGHT_CUTOFF) {
+ float m_cdlum = linear_rgb_to_gray(base_color);
+ float3 m_ctint = m_cdlum > 0.0f ? base_color / m_cdlum : make_float3(1.0f, 1.0f, 1.0f); // normalize lum. to isolate hue+sat
+
+ /* color of the sheen component */
+ float3 sheen_color = make_float3(1.0f, 1.0f, 1.0f) * (1.0f - sheen_tint) + m_ctint * sheen_tint;
+
+ float3 sheen_weight = weight * sheen * sheen_color * diffuse_weight;
+
+ PrincipledSheenBsdf *bsdf = (PrincipledSheenBsdf*)bsdf_alloc(sd, sizeof(PrincipledSheenBsdf), sheen_weight);
+
+ if(bsdf) {
+ bsdf->N = N;
+
+ /* setup bsdf */
+ sd->flag |= bsdf_principled_sheen_setup(bsdf);
+ }
+ }
+
+ /* specular reflection */
+#ifdef __CAUSTICS_TRICKS__
+ if(kernel_data.integrator.caustics_reflective || (path_flag & PATH_RAY_DIFFUSE) == 0) {
+#endif
+ if(specular_weight > CLOSURE_WEIGHT_CUTOFF && (specular > CLOSURE_WEIGHT_CUTOFF || metallic > CLOSURE_WEIGHT_CUTOFF)) {
+ float3 spec_weight = weight * specular_weight;
+
+ MicrofacetBsdf *bsdf = (MicrofacetBsdf*)bsdf_alloc(sd, sizeof(MicrofacetBsdf), spec_weight);
+ MicrofacetExtra *extra = (MicrofacetExtra*)closure_alloc_extra(sd, sizeof(MicrofacetExtra));
+
+ if(bsdf && extra) {
+ bsdf->N = N;
+ bsdf->ior = (2.0f / (1.0f - safe_sqrtf(0.08f * specular))) - 1.0f;
+ bsdf->T = T;
+ bsdf->extra = extra;
+
+ float aspect = safe_sqrtf(1.0f - anisotropic * 0.9f);
+ float r2 = roughness * roughness;
+
+ bsdf->alpha_x = r2 / aspect;
+ bsdf->alpha_y = r2 * aspect;
+
+ float m_cdlum = 0.3f * base_color.x + 0.6f * base_color.y + 0.1f * base_color.z; // luminance approx.
+ float3 m_ctint = m_cdlum > 0.0f ? base_color / m_cdlum : make_float3(0.0f, 0.0f, 0.0f); // normalize lum. to isolate hue+sat
+ float3 tmp_col = make_float3(1.0f, 1.0f, 1.0f) * (1.0f - specular_tint) + m_ctint * specular_tint;
+
+ bsdf->extra->cspec0 = (specular * 0.08f * tmp_col) * (1.0f - metallic) + base_color * metallic;
+ bsdf->extra->color = base_color;
+
+ /* setup bsdf */
+ if(distribution == CLOSURE_BSDF_MICROFACET_GGX_GLASS_ID || roughness <= 0.075f) /* use single-scatter GGX */
+ sd->flag |= bsdf_microfacet_ggx_aniso_fresnel_setup(bsdf, sd);
+ else /* use multi-scatter GGX */
+ sd->flag |= bsdf_microfacet_multi_ggx_aniso_fresnel_setup(bsdf, sd);
+ }
+ }
+#ifdef __CAUSTICS_TRICKS__
+ }
+#endif
+
+ /* BSDF */
+#ifdef __CAUSTICS_TRICKS__
+ if(kernel_data.integrator.caustics_reflective || kernel_data.integrator.caustics_refractive || (path_flag & PATH_RAY_DIFFUSE) == 0) {
+#endif
+ if(final_transmission > CLOSURE_WEIGHT_CUTOFF) {
+ float3 glass_weight = weight * final_transmission;
+ float3 cspec0 = base_color * specular_tint + make_float3(1.0f, 1.0f, 1.0f) * (1.0f - specular_tint);
+
+ if(roughness <= 5e-2f || distribution == CLOSURE_BSDF_MICROFACET_GGX_GLASS_ID) { /* use single-scatter GGX */
+ float refl_roughness = roughness;
+
+ /* reflection */
+#ifdef __CAUSTICS_TRICKS__
+ if(kernel_data.integrator.caustics_reflective || (path_flag & PATH_RAY_DIFFUSE) == 0)
+#endif
+ {
+ MicrofacetBsdf *bsdf = (MicrofacetBsdf*)bsdf_alloc(sd, sizeof(MicrofacetBsdf), glass_weight*fresnel);
+ MicrofacetExtra *extra = (MicrofacetExtra*)closure_alloc_extra(sd, sizeof(MicrofacetExtra));
+
+ if(bsdf && extra) {
+ bsdf->N = N;
+ bsdf->extra = extra;
+
+ bsdf->alpha_x = refl_roughness * refl_roughness;
+ bsdf->alpha_y = refl_roughness * refl_roughness;
+ bsdf->ior = ior;
+
+ bsdf->extra->color = base_color;
+ bsdf->extra->cspec0 = cspec0;
+
+ /* setup bsdf */
+ sd->flag |= bsdf_microfacet_ggx_fresnel_setup(bsdf, sd);
+ }
+ }
+
+ /* refraction */
+#ifdef __CAUSTICS_TRICKS__
+ if(kernel_data.integrator.caustics_refractive || (path_flag & PATH_RAY_DIFFUSE) == 0)
+#endif
+ {
+ MicrofacetBsdf *bsdf = (MicrofacetBsdf*)bsdf_alloc(sd, sizeof(MicrofacetBsdf), base_color*glass_weight*(1.0f - fresnel));
+
+ if(bsdf) {
+ bsdf->N = N;
+
+ if(distribution == CLOSURE_BSDF_MICROFACET_GGX_GLASS_ID)
+ transmission_roughness = 1.0f - (1.0f - refl_roughness) * (1.0f - transmission_roughness);
+ else
+ transmission_roughness = refl_roughness;
+
+ bsdf->alpha_x = transmission_roughness * transmission_roughness;
+ bsdf->alpha_y = transmission_roughness * transmission_roughness;
+ bsdf->ior = ior;
+
+ /* setup bsdf */
+ sd->flag |= bsdf_microfacet_ggx_refraction_setup(bsdf);
+ }
+ }
+ }
+ else { /* use multi-scatter GGX */
+ MicrofacetBsdf *bsdf = (MicrofacetBsdf*)bsdf_alloc(sd, sizeof(MicrofacetBsdf), glass_weight);
+ MicrofacetExtra *extra = (MicrofacetExtra*)closure_alloc_extra(sd, sizeof(MicrofacetExtra));
+
+ if(bsdf && extra) {
+ bsdf->N = N;
+ bsdf->extra = extra;
+ bsdf->T = make_float3(0.0f, 0.0f, 0.0f);
+
+ bsdf->alpha_x = roughness * roughness;
+ bsdf->alpha_y = roughness * roughness;
+ bsdf->ior = ior;
+
+ bsdf->extra->color = base_color;
+ bsdf->extra->cspec0 = cspec0;
+
+ /* setup bsdf */
+ sd->flag |= bsdf_microfacet_multi_ggx_glass_fresnel_setup(bsdf, sd);
+ }
+ }
+ }
+#ifdef __CAUSTICS_TRICKS__
+ }
+#endif
+
+ /* clearcoat */
+#ifdef __CAUSTICS_TRICKS__
+ if(kernel_data.integrator.caustics_reflective || (path_flag & PATH_RAY_DIFFUSE) == 0) {
+#endif
+ if(clearcoat > CLOSURE_WEIGHT_CUTOFF) {
+ MicrofacetBsdf *bsdf = (MicrofacetBsdf*)bsdf_alloc(sd, sizeof(MicrofacetBsdf), weight);
+ MicrofacetExtra *extra = (MicrofacetExtra*)closure_alloc_extra(sd, sizeof(MicrofacetExtra));
+
+ if(bsdf && extra) {
+ bsdf->N = clearcoat_normal;
+ bsdf->ior = 1.5f;
+ bsdf->extra = extra;
+
+ bsdf->alpha_x = clearcoat_roughness * clearcoat_roughness;
+ bsdf->alpha_y = clearcoat_roughness * clearcoat_roughness;
+
+ bsdf->extra->cspec0 = make_float3(0.04f, 0.04f, 0.04f);
+ bsdf->extra->clearcoat = clearcoat;
+
+ /* setup bsdf */
+ sd->flag |= bsdf_microfacet_ggx_clearcoat_setup(bsdf, sd);
+ }
+ }
+#ifdef __CAUSTICS_TRICKS__
+ }
+#endif
+
+ break;
+ }
+#endif /* __PRINCIPLED__ */
case CLOSURE_BSDF_DIFFUSE_ID: {
float3 weight = sd->svm_closure_weight * mix_weight;
OrenNayarBsdf *bsdf = (OrenNayarBsdf*)bsdf_alloc(sd, sizeof(OrenNayarBsdf), weight);
@@ -110,6 +449,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
ShaderClosure *bsdf = bsdf_alloc(sd, sizeof(ShaderClosure), weight);
if(bsdf) {
+ bsdf->N = N;
sd->flag |= bsdf_transparent_setup(bsdf);
}
break;
@@ -344,6 +684,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
#ifdef __CAUSTICS_TRICKS__
if(!kernel_data.integrator.caustics_reflective && (path_flag & PATH_RAY_DIFFUSE))
break;
+ ATTR_FALLTHROUGH;
#endif
case CLOSURE_BSDF_DIFFUSE_TOON_ID: {
float3 weight = sd->svm_closure_weight * mix_weight;
@@ -370,6 +711,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
ShaderClosure *bsdf = bsdf_alloc(sd, sizeof(ShaderClosure), weight);
if(bsdf) {
+ bsdf->N = N;
/* todo: giving a fixed weight here will cause issues when
* mixing multiple BSDFS. energy will not be conserved and
* the throughput can blow up after multiple bounces. we
@@ -383,6 +725,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
HairBsdf *bsdf = (HairBsdf*)bsdf_alloc(sd, sizeof(HairBsdf), weight);
if(bsdf) {
+ bsdf->N = N;
bsdf->roughness1 = param1;
bsdf->roughness2 = param2;
bsdf->offset = -stack_load_float(stack, data_node.z);
diff --git a/intern/cycles/kernel/svm/svm_displace.h b/intern/cycles/kernel/svm/svm_displace.h
index c94fa130af7..656357be52d 100644
--- a/intern/cycles/kernel/svm/svm_displace.h
+++ b/intern/cycles/kernel/svm/svm_displace.h
@@ -63,8 +63,13 @@ ccl_device void svm_node_set_bump(KernelGlobals *kg, ShaderData *sd, float *stac
strength = max(strength, 0.0f);
/* compute and output perturbed normal */
- float3 normal_out = normalize(absdet*normal_in - distance*signf(det)*surfgrad);
- normal_out = normalize(strength*normal_out + (1.0f - strength)*normal_in);
+ float3 normal_out = safe_normalize(absdet*normal_in - distance*signf(det)*surfgrad);
+ if(is_zero(normal_out)) {
+ normal_out = normal_in;
+ }
+ else {
+ normal_out = normalize(strength*normal_out + (1.0f - strength)*normal_in);
+ }
if(use_object_space) {
object_normal_transform(kg, sd, &normal_out);
diff --git a/intern/cycles/kernel/svm/svm_geometry.h b/intern/cycles/kernel/svm/svm_geometry.h
index 4a09d9f6653..cce4e89e715 100644
--- a/intern/cycles/kernel/svm/svm_geometry.h
+++ b/intern/cycles/kernel/svm/svm_geometry.h
@@ -37,6 +37,7 @@ ccl_device_inline void svm_node_geometry(KernelGlobals *kg,
#ifdef __UV__
case NODE_GEOM_uv: data = make_float3(sd->u, sd->v, 0.0f); break;
#endif
+ default: data = make_float3(0.0f, 0.0f, 0.0f);
}
stack_store_float3(stack, out_offset, data);
diff --git a/intern/cycles/kernel/svm/svm_image.h b/intern/cycles/kernel/svm/svm_image.h
index 76acc9253a1..7be03dcd65a 100644
--- a/intern/cycles/kernel/svm/svm_image.h
+++ b/intern/cycles/kernel/svm/svm_image.h
@@ -16,29 +16,10 @@
CCL_NAMESPACE_BEGIN
-/* Float4 textures on various devices. */
-#if defined(__KERNEL_CPU__)
-# define TEX_NUM_FLOAT4_IMAGES TEX_NUM_FLOAT4_CPU
-#elif defined(__KERNEL_CUDA__)
-# if __CUDA_ARCH__ < 300
-# define TEX_NUM_FLOAT4_IMAGES TEX_NUM_FLOAT4_CUDA
-# else
-# define TEX_NUM_FLOAT4_IMAGES TEX_NUM_FLOAT4_CUDA_KEPLER
-# endif
-#else
-# define TEX_NUM_FLOAT4_IMAGES TEX_NUM_FLOAT4_OPENCL
-#endif
-
ccl_device float4 svm_image_texture(KernelGlobals *kg, int id, float x, float y, uint srgb, uint use_alpha)
{
#ifdef __KERNEL_CPU__
-# ifdef __KERNEL_SSE2__
- ssef r_ssef;
- float4 &r = (float4 &)r_ssef;
- r = kernel_tex_image_interp(id, x, y);
-# else
float4 r = kernel_tex_image_interp(id, x, y);
-# endif
#elif defined(__KERNEL_OPENCL__)
float4 r = kernel_tex_image_interp(kg, id, x, y);
#else
@@ -56,94 +37,94 @@ ccl_device float4 svm_image_texture(KernelGlobals *kg, int id, float x, float y,
switch(id) {
case 0: r = kernel_tex_image_interp(__tex_image_float4_000, x, y); break;
- case 1: r = kernel_tex_image_interp(__tex_image_float4_001, x, y); break;
- case 2: r = kernel_tex_image_interp(__tex_image_float4_002, x, y); break;
- case 3: r = kernel_tex_image_interp(__tex_image_float4_003, x, y); break;
- case 4: r = kernel_tex_image_interp(__tex_image_float4_004, x, y); break;
- case 5: r = kernel_tex_image_interp(__tex_image_byte4_005, x, y); break;
- case 6: r = kernel_tex_image_interp(__tex_image_byte4_006, x, y); break;
- case 7: r = kernel_tex_image_interp(__tex_image_byte4_007, x, y); break;
- case 8: r = kernel_tex_image_interp(__tex_image_byte4_008, x, y); break;
+ case 8: r = kernel_tex_image_interp(__tex_image_float4_008, x, y); break;
+ case 16: r = kernel_tex_image_interp(__tex_image_float4_016, x, y); break;
+ case 24: r = kernel_tex_image_interp(__tex_image_float4_024, x, y); break;
+ case 32: r = kernel_tex_image_interp(__tex_image_float4_032, x, y); break;
+ case 1: r = kernel_tex_image_interp(__tex_image_byte4_001, x, y); break;
case 9: r = kernel_tex_image_interp(__tex_image_byte4_009, x, y); break;
- case 10: r = kernel_tex_image_interp(__tex_image_byte4_010, x, y); break;
- case 11: r = kernel_tex_image_interp(__tex_image_byte4_011, x, y); break;
- case 12: r = kernel_tex_image_interp(__tex_image_byte4_012, x, y); break;
- case 13: r = kernel_tex_image_interp(__tex_image_byte4_013, x, y); break;
- case 14: r = kernel_tex_image_interp(__tex_image_byte4_014, x, y); break;
- case 15: r = kernel_tex_image_interp(__tex_image_byte4_015, x, y); break;
- case 16: r = kernel_tex_image_interp(__tex_image_byte4_016, x, y); break;
case 17: r = kernel_tex_image_interp(__tex_image_byte4_017, x, y); break;
- case 18: r = kernel_tex_image_interp(__tex_image_byte4_018, x, y); break;
- case 19: r = kernel_tex_image_interp(__tex_image_byte4_019, x, y); break;
- case 20: r = kernel_tex_image_interp(__tex_image_byte4_020, x, y); break;
- case 21: r = kernel_tex_image_interp(__tex_image_byte4_021, x, y); break;
- case 22: r = kernel_tex_image_interp(__tex_image_byte4_022, x, y); break;
- case 23: r = kernel_tex_image_interp(__tex_image_byte4_023, x, y); break;
- case 24: r = kernel_tex_image_interp(__tex_image_byte4_024, x, y); break;
case 25: r = kernel_tex_image_interp(__tex_image_byte4_025, x, y); break;
- case 26: r = kernel_tex_image_interp(__tex_image_byte4_026, x, y); break;
- case 27: r = kernel_tex_image_interp(__tex_image_byte4_027, x, y); break;
- case 28: r = kernel_tex_image_interp(__tex_image_byte4_028, x, y); break;
- case 29: r = kernel_tex_image_interp(__tex_image_byte4_029, x, y); break;
- case 30: r = kernel_tex_image_interp(__tex_image_byte4_030, x, y); break;
- case 31: r = kernel_tex_image_interp(__tex_image_byte4_031, x, y); break;
- case 32: r = kernel_tex_image_interp(__tex_image_byte4_032, x, y); break;
case 33: r = kernel_tex_image_interp(__tex_image_byte4_033, x, y); break;
- case 34: r = kernel_tex_image_interp(__tex_image_byte4_034, x, y); break;
- case 35: r = kernel_tex_image_interp(__tex_image_byte4_035, x, y); break;
- case 36: r = kernel_tex_image_interp(__tex_image_byte4_036, x, y); break;
- case 37: r = kernel_tex_image_interp(__tex_image_byte4_037, x, y); break;
- case 38: r = kernel_tex_image_interp(__tex_image_byte4_038, x, y); break;
- case 39: r = kernel_tex_image_interp(__tex_image_byte4_039, x, y); break;
- case 40: r = kernel_tex_image_interp(__tex_image_byte4_040, x, y); break;
case 41: r = kernel_tex_image_interp(__tex_image_byte4_041, x, y); break;
- case 42: r = kernel_tex_image_interp(__tex_image_byte4_042, x, y); break;
- case 43: r = kernel_tex_image_interp(__tex_image_byte4_043, x, y); break;
- case 44: r = kernel_tex_image_interp(__tex_image_byte4_044, x, y); break;
- case 45: r = kernel_tex_image_interp(__tex_image_byte4_045, x, y); break;
- case 46: r = kernel_tex_image_interp(__tex_image_byte4_046, x, y); break;
- case 47: r = kernel_tex_image_interp(__tex_image_byte4_047, x, y); break;
- case 48: r = kernel_tex_image_interp(__tex_image_byte4_048, x, y); break;
case 49: r = kernel_tex_image_interp(__tex_image_byte4_049, x, y); break;
- case 50: r = kernel_tex_image_interp(__tex_image_byte4_050, x, y); break;
- case 51: r = kernel_tex_image_interp(__tex_image_byte4_051, x, y); break;
- case 52: r = kernel_tex_image_interp(__tex_image_byte4_052, x, y); break;
- case 53: r = kernel_tex_image_interp(__tex_image_byte4_053, x, y); break;
- case 54: r = kernel_tex_image_interp(__tex_image_byte4_054, x, y); break;
- case 55: r = kernel_tex_image_interp(__tex_image_byte4_055, x, y); break;
- case 56: r = kernel_tex_image_interp(__tex_image_byte4_056, x, y); break;
case 57: r = kernel_tex_image_interp(__tex_image_byte4_057, x, y); break;
- case 58: r = kernel_tex_image_interp(__tex_image_byte4_058, x, y); break;
- case 59: r = kernel_tex_image_interp(__tex_image_byte4_059, x, y); break;
- case 60: r = kernel_tex_image_interp(__tex_image_byte4_060, x, y); break;
- case 61: r = kernel_tex_image_interp(__tex_image_byte4_061, x, y); break;
- case 62: r = kernel_tex_image_interp(__tex_image_byte4_062, x, y); break;
- case 63: r = kernel_tex_image_interp(__tex_image_byte4_063, x, y); break;
- case 64: r = kernel_tex_image_interp(__tex_image_byte4_064, x, y); break;
case 65: r = kernel_tex_image_interp(__tex_image_byte4_065, x, y); break;
- case 66: r = kernel_tex_image_interp(__tex_image_byte4_066, x, y); break;
- case 67: r = kernel_tex_image_interp(__tex_image_byte4_067, x, y); break;
- case 68: r = kernel_tex_image_interp(__tex_image_byte4_068, x, y); break;
- case 69: r = kernel_tex_image_interp(__tex_image_byte4_069, x, y); break;
- case 70: r = kernel_tex_image_interp(__tex_image_byte4_070, x, y); break;
- case 71: r = kernel_tex_image_interp(__tex_image_byte4_071, x, y); break;
- case 72: r = kernel_tex_image_interp(__tex_image_byte4_072, x, y); break;
case 73: r = kernel_tex_image_interp(__tex_image_byte4_073, x, y); break;
- case 74: r = kernel_tex_image_interp(__tex_image_byte4_074, x, y); break;
- case 75: r = kernel_tex_image_interp(__tex_image_byte4_075, x, y); break;
- case 76: r = kernel_tex_image_interp(__tex_image_byte4_076, x, y); break;
- case 77: r = kernel_tex_image_interp(__tex_image_byte4_077, x, y); break;
- case 78: r = kernel_tex_image_interp(__tex_image_byte4_078, x, y); break;
- case 79: r = kernel_tex_image_interp(__tex_image_byte4_079, x, y); break;
- case 80: r = kernel_tex_image_interp(__tex_image_byte4_080, x, y); break;
case 81: r = kernel_tex_image_interp(__tex_image_byte4_081, x, y); break;
- case 82: r = kernel_tex_image_interp(__tex_image_byte4_082, x, y); break;
- case 83: r = kernel_tex_image_interp(__tex_image_byte4_083, x, y); break;
- case 84: r = kernel_tex_image_interp(__tex_image_byte4_084, x, y); break;
- case 85: r = kernel_tex_image_interp(__tex_image_byte4_085, x, y); break;
- case 86: r = kernel_tex_image_interp(__tex_image_byte4_086, x, y); break;
- case 87: r = kernel_tex_image_interp(__tex_image_byte4_087, x, y); break;
- case 88: r = kernel_tex_image_interp(__tex_image_byte4_088, x, y); break;
+ case 89: r = kernel_tex_image_interp(__tex_image_byte4_089, x, y); break;
+ case 97: r = kernel_tex_image_interp(__tex_image_byte4_097, x, y); break;
+ case 105: r = kernel_tex_image_interp(__tex_image_byte4_105, x, y); break;
+ case 113: r = kernel_tex_image_interp(__tex_image_byte4_113, x, y); break;
+ case 121: r = kernel_tex_image_interp(__tex_image_byte4_121, x, y); break;
+ case 129: r = kernel_tex_image_interp(__tex_image_byte4_129, x, y); break;
+ case 137: r = kernel_tex_image_interp(__tex_image_byte4_137, x, y); break;
+ case 145: r = kernel_tex_image_interp(__tex_image_byte4_145, x, y); break;
+ case 153: r = kernel_tex_image_interp(__tex_image_byte4_153, x, y); break;
+ case 161: r = kernel_tex_image_interp(__tex_image_byte4_161, x, y); break;
+ case 169: r = kernel_tex_image_interp(__tex_image_byte4_169, x, y); break;
+ case 177: r = kernel_tex_image_interp(__tex_image_byte4_177, x, y); break;
+ case 185: r = kernel_tex_image_interp(__tex_image_byte4_185, x, y); break;
+ case 193: r = kernel_tex_image_interp(__tex_image_byte4_193, x, y); break;
+ case 201: r = kernel_tex_image_interp(__tex_image_byte4_201, x, y); break;
+ case 209: r = kernel_tex_image_interp(__tex_image_byte4_209, x, y); break;
+ case 217: r = kernel_tex_image_interp(__tex_image_byte4_217, x, y); break;
+ case 225: r = kernel_tex_image_interp(__tex_image_byte4_225, x, y); break;
+ case 233: r = kernel_tex_image_interp(__tex_image_byte4_233, x, y); break;
+ case 241: r = kernel_tex_image_interp(__tex_image_byte4_241, x, y); break;
+ case 249: r = kernel_tex_image_interp(__tex_image_byte4_249, x, y); break;
+ case 257: r = kernel_tex_image_interp(__tex_image_byte4_257, x, y); break;
+ case 265: r = kernel_tex_image_interp(__tex_image_byte4_265, x, y); break;
+ case 273: r = kernel_tex_image_interp(__tex_image_byte4_273, x, y); break;
+ case 281: r = kernel_tex_image_interp(__tex_image_byte4_281, x, y); break;
+ case 289: r = kernel_tex_image_interp(__tex_image_byte4_289, x, y); break;
+ case 297: r = kernel_tex_image_interp(__tex_image_byte4_297, x, y); break;
+ case 305: r = kernel_tex_image_interp(__tex_image_byte4_305, x, y); break;
+ case 313: r = kernel_tex_image_interp(__tex_image_byte4_313, x, y); break;
+ case 321: r = kernel_tex_image_interp(__tex_image_byte4_321, x, y); break;
+ case 329: r = kernel_tex_image_interp(__tex_image_byte4_329, x, y); break;
+ case 337: r = kernel_tex_image_interp(__tex_image_byte4_337, x, y); break;
+ case 345: r = kernel_tex_image_interp(__tex_image_byte4_345, x, y); break;
+ case 353: r = kernel_tex_image_interp(__tex_image_byte4_353, x, y); break;
+ case 361: r = kernel_tex_image_interp(__tex_image_byte4_361, x, y); break;
+ case 369: r = kernel_tex_image_interp(__tex_image_byte4_369, x, y); break;
+ case 377: r = kernel_tex_image_interp(__tex_image_byte4_377, x, y); break;
+ case 385: r = kernel_tex_image_interp(__tex_image_byte4_385, x, y); break;
+ case 393: r = kernel_tex_image_interp(__tex_image_byte4_393, x, y); break;
+ case 401: r = kernel_tex_image_interp(__tex_image_byte4_401, x, y); break;
+ case 409: r = kernel_tex_image_interp(__tex_image_byte4_409, x, y); break;
+ case 417: r = kernel_tex_image_interp(__tex_image_byte4_417, x, y); break;
+ case 425: r = kernel_tex_image_interp(__tex_image_byte4_425, x, y); break;
+ case 433: r = kernel_tex_image_interp(__tex_image_byte4_433, x, y); break;
+ case 441: r = kernel_tex_image_interp(__tex_image_byte4_441, x, y); break;
+ case 449: r = kernel_tex_image_interp(__tex_image_byte4_449, x, y); break;
+ case 457: r = kernel_tex_image_interp(__tex_image_byte4_457, x, y); break;
+ case 465: r = kernel_tex_image_interp(__tex_image_byte4_465, x, y); break;
+ case 473: r = kernel_tex_image_interp(__tex_image_byte4_473, x, y); break;
+ case 481: r = kernel_tex_image_interp(__tex_image_byte4_481, x, y); break;
+ case 489: r = kernel_tex_image_interp(__tex_image_byte4_489, x, y); break;
+ case 497: r = kernel_tex_image_interp(__tex_image_byte4_497, x, y); break;
+ case 505: r = kernel_tex_image_interp(__tex_image_byte4_505, x, y); break;
+ case 513: r = kernel_tex_image_interp(__tex_image_byte4_513, x, y); break;
+ case 521: r = kernel_tex_image_interp(__tex_image_byte4_521, x, y); break;
+ case 529: r = kernel_tex_image_interp(__tex_image_byte4_529, x, y); break;
+ case 537: r = kernel_tex_image_interp(__tex_image_byte4_537, x, y); break;
+ case 545: r = kernel_tex_image_interp(__tex_image_byte4_545, x, y); break;
+ case 553: r = kernel_tex_image_interp(__tex_image_byte4_553, x, y); break;
+ case 561: r = kernel_tex_image_interp(__tex_image_byte4_561, x, y); break;
+ case 569: r = kernel_tex_image_interp(__tex_image_byte4_569, x, y); break;
+ case 577: r = kernel_tex_image_interp(__tex_image_byte4_577, x, y); break;
+ case 585: r = kernel_tex_image_interp(__tex_image_byte4_585, x, y); break;
+ case 593: r = kernel_tex_image_interp(__tex_image_byte4_593, x, y); break;
+ case 601: r = kernel_tex_image_interp(__tex_image_byte4_601, x, y); break;
+ case 609: r = kernel_tex_image_interp(__tex_image_byte4_609, x, y); break;
+ case 617: r = kernel_tex_image_interp(__tex_image_byte4_617, x, y); break;
+ case 625: r = kernel_tex_image_interp(__tex_image_byte4_625, x, y); break;
+ case 633: r = kernel_tex_image_interp(__tex_image_byte4_633, x, y); break;
+ case 641: r = kernel_tex_image_interp(__tex_image_byte4_641, x, y); break;
+ case 649: r = kernel_tex_image_interp(__tex_image_byte4_649, x, y); break;
+ case 657: r = kernel_tex_image_interp(__tex_image_byte4_657, x, y); break;
+ case 665: r = kernel_tex_image_interp(__tex_image_byte4_665, x, y); break;
default:
kernel_assert(0);
return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
@@ -151,8 +132,13 @@ ccl_device float4 svm_image_texture(KernelGlobals *kg, int id, float x, float y,
# else
CUtexObject tex = kernel_tex_fetch(__bindless_mapping, id);
/* float4, byte4 and half4 */
- if(id < TEX_START_FLOAT_CUDA_KEPLER)
+ const int texture_type = kernel_tex_type(id);
+ if(texture_type == IMAGE_DATA_TYPE_FLOAT4 ||
+ texture_type == IMAGE_DATA_TYPE_BYTE4 ||
+ texture_type == IMAGE_DATA_TYPE_HALF4)
+ {
r = kernel_tex_image_interp_float4(tex, x, y);
+ }
/* float, byte and half */
else {
float f = kernel_tex_image_interp_float(tex, x, y);
@@ -161,40 +147,22 @@ ccl_device float4 svm_image_texture(KernelGlobals *kg, int id, float x, float y,
# endif
#endif
-#ifdef __KERNEL_SSE2__
- float alpha = r.w;
+ const float alpha = r.w;
if(use_alpha && alpha != 1.0f && alpha != 0.0f) {
- r_ssef = r_ssef / ssef(alpha);
- if(id >= TEX_NUM_FLOAT4_IMAGES)
- r_ssef = min(r_ssef, ssef(1.0f));
- r.w = alpha;
- }
-
- if(srgb) {
- r_ssef = color_srgb_to_scene_linear(r_ssef);
- r.w = alpha;
- }
-#else
- if(use_alpha && r.w != 1.0f && r.w != 0.0f) {
- float invw = 1.0f/r.w;
- r.x *= invw;
- r.y *= invw;
- r.z *= invw;
-
- if(id >= TEX_NUM_FLOAT4_IMAGES) {
- r.x = min(r.x, 1.0f);
- r.y = min(r.y, 1.0f);
- r.z = min(r.z, 1.0f);
+ r /= alpha;
+ const int texture_type = kernel_tex_type(id);
+ if(texture_type == IMAGE_DATA_TYPE_BYTE4 ||
+ texture_type == IMAGE_DATA_TYPE_BYTE)
+ {
+ r = min(r, make_float4(1.0f, 1.0f, 1.0f, 1.0f));
}
+ r.w = alpha;
}
if(srgb) {
- r.x = color_srgb_to_scene_linear(r.x);
- r.y = color_srgb_to_scene_linear(r.y);
- r.z = color_srgb_to_scene_linear(r.z);
+ r = color_srgb_to_scene_linear_v4(r);
}
-#endif
return r;
}
@@ -336,8 +304,8 @@ ccl_device void svm_node_tex_environment(KernelGlobals *kg, ShaderData *sd, floa
float3 co = stack_load_float3(stack, co_offset);
float2 uv;
- co = normalize(co);
-
+ co = safe_normalize(co);
+
if(projection == 0)
uv = direction_to_equirectangular(co);
else
diff --git a/intern/cycles/kernel/svm/svm_types.h b/intern/cycles/kernel/svm/svm_types.h
index 47209ddfbab..d859cae1708 100644
--- a/intern/cycles/kernel/svm/svm_types.h
+++ b/intern/cycles/kernel/svm/svm_types.h
@@ -397,17 +397,23 @@ typedef enum ClosureType {
CLOSURE_BSDF_DIFFUSE_ID,
CLOSURE_BSDF_OREN_NAYAR_ID,
CLOSURE_BSDF_DIFFUSE_RAMP_ID,
+ CLOSURE_BSDF_PRINCIPLED_DIFFUSE_ID,
+ CLOSURE_BSDF_PRINCIPLED_SHEEN_ID,
CLOSURE_BSDF_DIFFUSE_TOON_ID,
/* Glossy */
- CLOSURE_BSDF_GLOSSY_ID,
CLOSURE_BSDF_REFLECTION_ID,
CLOSURE_BSDF_MICROFACET_GGX_ID,
+ CLOSURE_BSDF_MICROFACET_GGX_FRESNEL_ID,
+ CLOSURE_BSDF_MICROFACET_GGX_CLEARCOAT_ID,
CLOSURE_BSDF_MICROFACET_BECKMANN_ID,
CLOSURE_BSDF_MICROFACET_MULTI_GGX_ID,
+ CLOSURE_BSDF_MICROFACET_MULTI_GGX_FRESNEL_ID,
CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ID,
CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID,
+ CLOSURE_BSDF_MICROFACET_GGX_ANISO_FRESNEL_ID,
CLOSURE_BSDF_MICROFACET_MULTI_GGX_ANISO_ID,
+ CLOSURE_BSDF_MICROFACET_MULTI_GGX_ANISO_FRESNEL_ID,
CLOSURE_BSDF_MICROFACET_BECKMANN_ANISO_ID,
CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ANISO_ID,
CLOSURE_BSDF_ASHIKHMIN_VELVET_ID,
@@ -416,24 +422,26 @@ typedef enum ClosureType {
CLOSURE_BSDF_HAIR_REFLECTION_ID,
/* Transmission */
- CLOSURE_BSDF_TRANSMISSION_ID,
CLOSURE_BSDF_TRANSLUCENT_ID,
CLOSURE_BSDF_REFRACTION_ID,
CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID,
CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID,
+ CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID,
CLOSURE_BSDF_MICROFACET_BECKMANN_GLASS_ID,
CLOSURE_BSDF_MICROFACET_GGX_GLASS_ID,
- CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID,
+ CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_FRESNEL_ID,
CLOSURE_BSDF_SHARP_GLASS_ID,
CLOSURE_BSDF_HAIR_TRANSMISSION_ID,
/* Special cases */
CLOSURE_BSDF_BSSRDF_ID,
+ CLOSURE_BSDF_BSSRDF_PRINCIPLED_ID,
CLOSURE_BSDF_TRANSPARENT_ID,
/* BSSRDF */
CLOSURE_BSSRDF_CUBIC_ID,
CLOSURE_BSSRDF_GAUSSIAN_ID,
+ CLOSURE_BSSRDF_PRINCIPLED_ID,
CLOSURE_BSSRDF_BURLEY_ID,
/* Other */
@@ -447,19 +455,24 @@ typedef enum ClosureType {
CLOSURE_VOLUME_ABSORPTION_ID,
CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID,
+ CLOSURE_BSDF_PRINCIPLED_ID,
+
NBUILTIN_CLOSURES
} ClosureType;
/* watch this, being lazy with memory usage */
#define CLOSURE_IS_BSDF(type) (type <= CLOSURE_BSDF_TRANSPARENT_ID)
#define CLOSURE_IS_BSDF_DIFFUSE(type) (type >= CLOSURE_BSDF_DIFFUSE_ID && type <= CLOSURE_BSDF_DIFFUSE_TOON_ID)
-#define CLOSURE_IS_BSDF_GLOSSY(type) (type >= CLOSURE_BSDF_GLOSSY_ID && type <= CLOSURE_BSDF_HAIR_REFLECTION_ID)
-#define CLOSURE_IS_BSDF_TRANSMISSION(type) (type >= CLOSURE_BSDF_TRANSMISSION_ID && type <= CLOSURE_BSDF_HAIR_TRANSMISSION_ID)
-#define CLOSURE_IS_BSDF_BSSRDF(type) (type == CLOSURE_BSDF_BSSRDF_ID)
+#define CLOSURE_IS_BSDF_GLOSSY(type) (type >= CLOSURE_BSDF_REFLECTION_ID && type <= CLOSURE_BSDF_HAIR_REFLECTION_ID)
+#define CLOSURE_IS_BSDF_TRANSMISSION(type) (type >= CLOSURE_BSDF_TRANSLUCENT_ID && type <= CLOSURE_BSDF_HAIR_TRANSMISSION_ID)
+#define CLOSURE_IS_BSDF_BSSRDF(type) (type == CLOSURE_BSDF_BSSRDF_ID || type == CLOSURE_BSDF_BSSRDF_PRINCIPLED_ID)
+#define CLOSURE_IS_BSDF_TRANSPARENT(type) (type == CLOSURE_BSDF_TRANSPARENT_ID)
#define CLOSURE_IS_BSDF_ANISOTROPIC(type) (type >= CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID && type <= CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ANISO_ID)
#define CLOSURE_IS_BSDF_MULTISCATTER(type) (type == CLOSURE_BSDF_MICROFACET_MULTI_GGX_ID ||\
type == CLOSURE_BSDF_MICROFACET_MULTI_GGX_ANISO_ID || \
- type == CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID)
+ type == CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID)
+#define CLOSURE_IS_BSDF_MICROFACET(type) ((type >= CLOSURE_BSDF_REFLECTION_ID && type <= CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ANISO_ID) ||\
+ (type >= CLOSURE_BSDF_REFRACTION_ID && type <= CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID))
#define CLOSURE_IS_BSDF_OR_BSSRDF(type) (type <= CLOSURE_BSSRDF_BURLEY_ID)
#define CLOSURE_IS_BSSRDF(type) (type >= CLOSURE_BSSRDF_CUBIC_ID && type <= CLOSURE_BSSRDF_BURLEY_ID)
#define CLOSURE_IS_VOLUME(type) (type >= CLOSURE_VOLUME_ID && type <= CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID)
@@ -468,7 +481,8 @@ typedef enum ClosureType {
#define CLOSURE_IS_BACKGROUND(type) (type == CLOSURE_BACKGROUND_ID)
#define CLOSURE_IS_AMBIENT_OCCLUSION(type) (type == CLOSURE_AMBIENT_OCCLUSION_ID)
#define CLOSURE_IS_PHASE(type) (type == CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID)
-#define CLOSURE_IS_GLASS(type) (type >= CLOSURE_BSDF_MICROFACET_BECKMANN_GLASS_ID && type <= CLOSURE_BSDF_SHARP_GLASS_ID)
+#define CLOSURE_IS_GLASS(type) (type >= CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID && type <= CLOSURE_BSDF_SHARP_GLASS_ID)
+#define CLOSURE_IS_PRINCIPLED(type) (type == CLOSURE_BSDF_PRINCIPLED_ID)
#define CLOSURE_WEIGHT_CUTOFF 1e-5f
diff --git a/intern/cycles/kernel/svm/svm_voxel.h b/intern/cycles/kernel/svm/svm_voxel.h
index 9e826c8c23f..f4a5b2b2994 100644
--- a/intern/cycles/kernel/svm/svm_voxel.h
+++ b/intern/cycles/kernel/svm/svm_voxel.h
@@ -46,8 +46,13 @@ ccl_device void svm_node_tex_voxel(KernelGlobals *kg,
# if defined(__KERNEL_CUDA__)
# if __CUDA_ARCH__ >= 300
CUtexObject tex = kernel_tex_fetch(__bindless_mapping, id);
- if(id < TEX_START_HALF4_CUDA_KEPLER)
+ const int texture_type = kernel_tex_type(id);
+ if(texture_type == IMAGE_DATA_TYPE_FLOAT4 ||
+ texture_type == IMAGE_DATA_TYPE_BYTE4 ||
+ texture_type == IMAGE_DATA_TYPE_HALF4)
+ {
r = kernel_tex_image_interp_3d_float4(tex, co.x, co.y, co.z);
+ }
else {
float f = kernel_tex_image_interp_3d_float(tex, co.x, co.y, co.z);
r = make_float4(f, f, f, 1.0f);