diff options
Diffstat (limited to 'intern/cycles/kernel')
132 files changed, 7386 insertions, 1576 deletions
diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt index dbc2ba2503a..23e9bd311c4 100644 --- a/intern/cycles/kernel/CMakeLists.txt +++ b/intern/cycles/kernel/CMakeLists.txt @@ -10,7 +10,23 @@ set(INC_SYS set(SRC kernels/cpu/kernel.cpp + kernels/cpu/kernel_sse2.cpp + kernels/cpu/kernel_sse3.cpp + kernels/cpu/kernel_sse41.cpp + kernels/cpu/kernel_avx.cpp + kernels/cpu/kernel_avx2.cpp kernels/cpu/kernel_split.cpp + kernels/cpu/kernel_split_sse2.cpp + kernels/cpu/kernel_split_sse3.cpp + kernels/cpu/kernel_split_sse41.cpp + kernels/cpu/kernel_split_avx.cpp + kernels/cpu/kernel_split_avx2.cpp + kernels/cpu/filter.cpp + kernels/cpu/filter_sse2.cpp + kernels/cpu/filter_sse3.cpp + kernels/cpu/filter_sse41.cpp + kernels/cpu/filter_avx.cpp + kernels/cpu/filter_avx2.cpp kernels/opencl/kernel.cl kernels/opencl/kernel_state_buffer_size.cl kernels/opencl/kernel_split.cl @@ -21,17 +37,22 @@ set(SRC kernels/opencl/kernel_lamp_emission.cl kernels/opencl/kernel_do_volume.cl kernels/opencl/kernel_indirect_background.cl + kernels/opencl/kernel_shader_setup.cl + kernels/opencl/kernel_shader_sort.cl kernels/opencl/kernel_shader_eval.cl kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl kernels/opencl/kernel_subsurface_scatter.cl kernels/opencl/kernel_direct_lighting.cl kernels/opencl/kernel_shadow_blocked_ao.cl kernels/opencl/kernel_shadow_blocked_dl.cl + kernels/opencl/kernel_enqueue_inactive.cl kernels/opencl/kernel_next_iteration_setup.cl kernels/opencl/kernel_indirect_subsurface.cl kernels/opencl/kernel_buffer_update.cl + kernels/opencl/filter.cl kernels/cuda/kernel.cu kernels/cuda/kernel_split.cu + kernels/cuda/filter.cu ) set(SRC_BVH_HEADERS @@ -93,12 +114,18 @@ set(SRC_KERNELS_CPU_HEADERS kernels/cpu/kernel_cpu.h kernels/cpu/kernel_cpu_impl.h kernels/cpu/kernel_cpu_image.h + kernels/cpu/filter_cpu.h + kernels/cpu/filter_cpu_impl.h ) set(SRC_KERNELS_CUDA_HEADERS kernels/cuda/kernel_config.h ) +set(SRC_KERNELS_OPENCL_HEADERS + 
kernels/opencl/kernel_split_function.h +) + set(SRC_CLOSURE_HEADERS closure/alloc.h closure/bsdf.h @@ -120,6 +147,8 @@ set(SRC_CLOSURE_HEADERS closure/bssrdf.h closure/emissive.h closure/volume.h + closure/bsdf_principled_diffuse.h + closure/bsdf_principled_sheen.h ) set(SRC_SVM_HEADERS @@ -186,6 +215,21 @@ set(SRC_GEOM_HEADERS geom/geom_volume.h ) +set(SRC_FILTER_HEADERS + filter/filter.h + filter/filter_defines.h + filter/filter_features.h + filter/filter_features_sse.h + filter/filter_kernel.h + filter/filter_nlm_cpu.h + filter/filter_nlm_gpu.h + filter/filter_prefilter.h + filter/filter_reconstruction.h + filter/filter_transform.h + filter/filter_transform_gpu.h + filter/filter_transform_sse.h +) + set(SRC_UTIL_HEADERS ../util/util_atomic.h ../util/util_color.h @@ -194,17 +238,52 @@ set(SRC_UTIL_HEADERS ../util/util_math.h ../util/util_math_fast.h ../util/util_math_intersect.h + ../util/util_math_float2.h + ../util/util_math_float3.h + ../util/util_math_float4.h + ../util/util_math_int2.h + ../util/util_math_int3.h + ../util/util_math_int4.h + ../util/util_math_matrix.h ../util/util_static_assert.h ../util/util_transform.h ../util/util_texture.h ../util/util_types.h + ../util/util_types_float2.h + ../util/util_types_float2_impl.h + ../util/util_types_float3.h + ../util/util_types_float3_impl.h + ../util/util_types_float4.h + ../util/util_types_float4_impl.h + ../util/util_types_int2.h + ../util/util_types_int2_impl.h + ../util/util_types_int3.h + ../util/util_types_int3_impl.h + ../util/util_types_int4.h + ../util/util_types_int4_impl.h + ../util/util_types_uchar2.h + ../util/util_types_uchar2_impl.h + ../util/util_types_uchar3.h + ../util/util_types_uchar3_impl.h + ../util/util_types_uchar4.h + ../util/util_types_uchar4_impl.h + ../util/util_types_uint2.h + ../util/util_types_uint2_impl.h + ../util/util_types_uint3.h + ../util/util_types_uint3_impl.h + ../util/util_types_uint4.h + ../util/util_types_uint4_impl.h + ../util/util_types_vector3.h + 
../util/util_types_vector3_impl.h ) set(SRC_SPLIT_HEADERS + split/kernel_branched.h split/kernel_buffer_update.h split/kernel_data_init.h split/kernel_direct_lighting.h split/kernel_do_volume.h + split/kernel_enqueue_inactive.h split/kernel_holdout_emission_blurring_pathtermination_ao.h split/kernel_indirect_background.h split/kernel_indirect_subsurface.h @@ -213,6 +292,8 @@ set(SRC_SPLIT_HEADERS split/kernel_path_init.h split/kernel_queue_enqueue.h split/kernel_scene_intersect.h + split/kernel_shader_setup.h + split/kernel_shader_sort.h split/kernel_shader_eval.h split/kernel_shadow_blocked_ao.h split/kernel_shadow_blocked_dl.h @@ -256,23 +337,21 @@ if(WITH_CYCLES_CUDA_BINARIES) ${SRC_CLOSURE_HEADERS} ${SRC_UTIL_HEADERS} ) + set(cuda_filter_sources kernels/cuda/filter.cu + ${SRC_HEADERS} + ${SRC_KERNELS_CUDA_HEADERS} + ${SRC_FILTER_HEADERS} + ${SRC_UTIL_HEADERS} + ) set(cuda_cubins) - macro(CYCLES_CUDA_KERNEL_ADD arch split experimental) - if(${split}) - set(cuda_extra_flags "-D__SPLIT__") - set(cuda_cubin kernel_split) - else() - set(cuda_extra_flags "") - set(cuda_cubin kernel) - endif() - + macro(CYCLES_CUDA_KERNEL_ADD arch name flags sources experimental) if(${experimental}) - set(cuda_extra_flags ${cuda_extra_flags} -D__KERNEL_EXPERIMENTAL__) - set(cuda_cubin ${cuda_cubin}_experimental) + set(flags ${flags} -D__KERNEL_EXPERIMENTAL__) + set(name ${name}_experimental) endif() - set(cuda_cubin ${cuda_cubin}_${arch}.cubin) + set(cuda_cubin ${name}_${arch}.cubin) if(WITH_CYCLES_DEBUG) set(cuda_debug_flags "-D__KERNEL_DEBUG__") @@ -286,11 +365,7 @@ if(WITH_CYCLES_CUDA_BINARIES) set(cuda_version_flags "-D__KERNEL_CUDA_VERSION__=${cuda_nvcc_version}") set(cuda_math_flags "--use_fast_math") - if(split) - set(cuda_kernel_src "/kernels/cuda/kernel_split.cu") - else() - set(cuda_kernel_src "/kernels/cuda/kernel.cu") - endif() + set(cuda_kernel_src "/kernels/cuda/${name}.cu") add_custom_command( OUTPUT ${cuda_cubin} @@ -304,13 +379,13 @@ if(WITH_CYCLES_CUDA_BINARIES) 
${cuda_arch_flags} ${cuda_version_flags} ${cuda_math_flags} - ${cuda_extra_flags} + ${flags} ${cuda_debug_flags} -I${CMAKE_CURRENT_SOURCE_DIR}/.. -DCCL_NAMESPACE_BEGIN= -DCCL_NAMESPACE_END= -DNVCC - DEPENDS ${cuda_sources}) + DEPENDS ${sources}) delayed_install("${CMAKE_CURRENT_BINARY_DIR}" "${cuda_cubin}" ${CYCLES_INSTALL_PATH}/lib) list(APPEND cuda_cubins ${cuda_cubin}) @@ -324,11 +399,12 @@ if(WITH_CYCLES_CUDA_BINARIES) foreach(arch ${CYCLES_CUDA_BINARIES_ARCH}) # Compile regular kernel - CYCLES_CUDA_KERNEL_ADD(${arch} FALSE FALSE) + CYCLES_CUDA_KERNEL_ADD(${arch} kernel "" "${cuda_sources}" FALSE) + CYCLES_CUDA_KERNEL_ADD(${arch} filter "" "${cuda_filter_sources}" FALSE) if(WITH_CYCLES_CUDA_SPLIT_KERNEL_BINARIES) # Compile split kernel - CYCLES_CUDA_KERNEL_ADD(${arch} TRUE FALSE) + CYCLES_CUDA_KERNEL_ADD(${arch} kernel_split "-D__SPLIT__" ${cuda_sources} FALSE) endif() endforeach() @@ -349,41 +425,30 @@ include_directories(SYSTEM ${INC_SYS}) set_source_files_properties(kernels/cpu/kernel.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_KERNEL_FLAGS}") set_source_files_properties(kernels/cpu/kernel_split.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_KERNEL_FLAGS}") +set_source_files_properties(kernels/cpu/filter.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_KERNEL_FLAGS}") if(CXX_HAS_SSE) - list(APPEND SRC - kernels/cpu/kernel_sse2.cpp - kernels/cpu/kernel_sse3.cpp - kernels/cpu/kernel_sse41.cpp - kernels/cpu/kernel_split_sse2.cpp - kernels/cpu/kernel_split_sse3.cpp - kernels/cpu/kernel_split_sse41.cpp - ) - set_source_files_properties(kernels/cpu/kernel_sse2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS}") set_source_files_properties(kernels/cpu/kernel_sse3.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE3_KERNEL_FLAGS}") set_source_files_properties(kernels/cpu/kernel_sse41.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE41_KERNEL_FLAGS}") set_source_files_properties(kernels/cpu/kernel_split_sse2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS}") 
set_source_files_properties(kernels/cpu/kernel_split_sse3.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE3_KERNEL_FLAGS}") set_source_files_properties(kernels/cpu/kernel_split_sse41.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE41_KERNEL_FLAGS}") + set_source_files_properties(kernels/cpu/filter_sse2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS}") + set_source_files_properties(kernels/cpu/filter_sse3.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE3_KERNEL_FLAGS}") + set_source_files_properties(kernels/cpu/filter_sse41.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE41_KERNEL_FLAGS}") endif() if(CXX_HAS_AVX) - list(APPEND SRC - kernels/cpu/kernel_avx.cpp - kernels/cpu/kernel_split_avx.cpp - ) set_source_files_properties(kernels/cpu/kernel_avx.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX_KERNEL_FLAGS}") set_source_files_properties(kernels/cpu/kernel_split_avx.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX_KERNEL_FLAGS}") + set_source_files_properties(kernels/cpu/filter_avx.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX_KERNEL_FLAGS}") endif() if(CXX_HAS_AVX2) - list(APPEND SRC - kernels/cpu/kernel_avx2.cpp - kernels/cpu/kernel_split_avx2.cpp - ) set_source_files_properties(kernels/cpu/kernel_avx2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX2_KERNEL_FLAGS}") set_source_files_properties(kernels/cpu/kernel_split_avx2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX2_KERNEL_FLAGS}") + set_source_files_properties(kernels/cpu/filter_avx2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX2_KERNEL_FLAGS}") endif() add_library(cycles_kernel @@ -391,8 +456,10 @@ add_library(cycles_kernel ${SRC_HEADERS} ${SRC_KERNELS_CPU_HEADERS} ${SRC_KERNELS_CUDA_HEADERS} + ${SRC_KERNELS_OPENCL_HEADERS} ${SRC_BVH_HEADERS} ${SRC_CLOSURE_HEADERS} + ${SRC_FILTER_HEADERS} ${SRC_SVM_HEADERS} ${SRC_GEOM_HEADERS} ${SRC_SPLIT_HEADERS} @@ -422,21 +489,28 @@ delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_scene_interse delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_lamp_emission.cl" 
${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_do_volume.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_indirect_background.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl) +delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_shader_setup.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl) +delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_shader_sort.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_shader_eval.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_subsurface_scatter.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_direct_lighting.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_shadow_blocked_ao.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_shadow_blocked_dl.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl) +delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_enqueue_inactive.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_next_iteration_setup.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_indirect_subsurface.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_buffer_update.cl" 
${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl) +delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_split_function.h" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl) +delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/filter.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/cuda/kernel.cu" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/cuda) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/cuda/kernel_split.cu" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/cuda) +delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/cuda/filter.cu" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/cuda) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNELS_CUDA_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/cuda) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_BVH_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/bvh) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_CLOSURE_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/closure) +delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_FILTER_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/filter) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_SVM_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/svm) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_GEOM_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/geom) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_UTIL_HEADERS}" ${CYCLES_INSTALL_PATH}/source/util) diff --git a/intern/cycles/kernel/closure/bsdf.h b/intern/cycles/kernel/closure/bsdf.h index 9139b99353a..86a00d2124d 100644 --- a/intern/cycles/kernel/closure/bsdf.h +++ b/intern/cycles/kernel/closure/bsdf.h @@ -27,6 +27,8 @@ #include "kernel/closure/bsdf_ashikhmin_shirley.h" #include "kernel/closure/bsdf_toon.h" #include "kernel/closure/bsdf_hair.h" +#include "kernel/closure/bsdf_principled_diffuse.h" +#include 
"kernel/closure/bsdf_principled_sheen.h" #ifdef __SUBSURFACE__ # include "kernel/closure/bssrdf.h" #endif @@ -86,16 +88,21 @@ ccl_device_forceinline int bsdf_sample(KernelGlobals *kg, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf); break; case CLOSURE_BSDF_MICROFACET_GGX_ID: + case CLOSURE_BSDF_MICROFACET_GGX_FRESNEL_ID: + case CLOSURE_BSDF_MICROFACET_GGX_CLEARCOAT_ID: case CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID: + case CLOSURE_BSDF_MICROFACET_GGX_ANISO_FRESNEL_ID: case CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID: label = bsdf_microfacet_ggx_sample(kg, sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf); break; case CLOSURE_BSDF_MICROFACET_MULTI_GGX_ID: + case CLOSURE_BSDF_MICROFACET_MULTI_GGX_FRESNEL_ID: label = bsdf_microfacet_multi_ggx_sample(kg, sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf, &sd->lcg_state); break; case CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID: + case CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_FRESNEL_ID: label = bsdf_microfacet_multi_ggx_glass_sample(kg, sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf, &sd->lcg_state); break; @@ -130,6 +137,17 @@ ccl_device_forceinline int bsdf_sample(KernelGlobals *kg, label = bsdf_hair_transmission_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf); break; +#ifdef __PRINCIPLED__ + case CLOSURE_BSDF_PRINCIPLED_DIFFUSE_ID: + case CLOSURE_BSDF_BSSRDF_PRINCIPLED_ID: + label = bsdf_principled_diffuse_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, + eval, omega_in, &domega_in->dx, &domega_in->dy, pdf); + break; + case CLOSURE_BSDF_PRINCIPLED_SHEEN_ID: + label = bsdf_principled_sheen_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, + eval, omega_in, &domega_in->dx, &domega_in->dy, pdf); + break; +#endif /* __PRINCIPLED__ */ #endif #ifdef __VOLUME__ 
case CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID: @@ -188,14 +206,19 @@ float3 bsdf_eval(KernelGlobals *kg, eval = bsdf_transparent_eval_reflect(sc, sd->I, omega_in, pdf); break; case CLOSURE_BSDF_MICROFACET_GGX_ID: + case CLOSURE_BSDF_MICROFACET_GGX_FRESNEL_ID: + case CLOSURE_BSDF_MICROFACET_GGX_CLEARCOAT_ID: case CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID: + case CLOSURE_BSDF_MICROFACET_GGX_ANISO_FRESNEL_ID: case CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID: eval = bsdf_microfacet_ggx_eval_reflect(sc, sd->I, omega_in, pdf); break; case CLOSURE_BSDF_MICROFACET_MULTI_GGX_ID: + case CLOSURE_BSDF_MICROFACET_MULTI_GGX_FRESNEL_ID: eval = bsdf_microfacet_multi_ggx_eval_reflect(sc, sd->I, omega_in, pdf, &sd->lcg_state); break; case CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID: + case CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_FRESNEL_ID: eval = bsdf_microfacet_multi_ggx_glass_eval_reflect(sc, sd->I, omega_in, pdf, &sd->lcg_state); break; case CLOSURE_BSDF_MICROFACET_BECKMANN_ID: @@ -222,6 +245,15 @@ float3 bsdf_eval(KernelGlobals *kg, case CLOSURE_BSDF_HAIR_TRANSMISSION_ID: eval = bsdf_hair_transmission_eval_reflect(sc, sd->I, omega_in, pdf); break; +#ifdef __PRINCIPLED__ + case CLOSURE_BSDF_PRINCIPLED_DIFFUSE_ID: + case CLOSURE_BSDF_BSSRDF_PRINCIPLED_ID: + eval = bsdf_principled_diffuse_eval_reflect(sc, sd->I, omega_in, pdf); + break; + case CLOSURE_BSDF_PRINCIPLED_SHEEN_ID: + eval = bsdf_principled_sheen_eval_reflect(sc, sd->I, omega_in, pdf); + break; +#endif /* __PRINCIPLED__ */ #endif #ifdef __VOLUME__ case CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID: @@ -256,14 +288,19 @@ float3 bsdf_eval(KernelGlobals *kg, eval = bsdf_transparent_eval_transmit(sc, sd->I, omega_in, pdf); break; case CLOSURE_BSDF_MICROFACET_GGX_ID: + case CLOSURE_BSDF_MICROFACET_GGX_FRESNEL_ID: + case CLOSURE_BSDF_MICROFACET_GGX_CLEARCOAT_ID: case CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID: + case CLOSURE_BSDF_MICROFACET_GGX_ANISO_FRESNEL_ID: case CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID: eval = bsdf_microfacet_ggx_eval_transmit(sc, 
sd->I, omega_in, pdf); break; case CLOSURE_BSDF_MICROFACET_MULTI_GGX_ID: + case CLOSURE_BSDF_MICROFACET_MULTI_GGX_FRESNEL_ID: eval = bsdf_microfacet_multi_ggx_eval_transmit(sc, sd->I, omega_in, pdf, &sd->lcg_state); break; case CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID: + case CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_FRESNEL_ID: eval = bsdf_microfacet_multi_ggx_glass_eval_transmit(sc, sd->I, omega_in, pdf, &sd->lcg_state); break; case CLOSURE_BSDF_MICROFACET_BECKMANN_ID: @@ -290,6 +327,15 @@ float3 bsdf_eval(KernelGlobals *kg, case CLOSURE_BSDF_HAIR_TRANSMISSION_ID: eval = bsdf_hair_transmission_eval_transmit(sc, sd->I, omega_in, pdf); break; +#ifdef __PRINCIPLED__ + case CLOSURE_BSDF_PRINCIPLED_DIFFUSE_ID: + case CLOSURE_BSDF_BSSRDF_PRINCIPLED_ID: + eval = bsdf_principled_diffuse_eval_transmit(sc, sd->I, omega_in, pdf); + break; + case CLOSURE_BSDF_PRINCIPLED_SHEEN_ID: + eval = bsdf_principled_sheen_eval_transmit(sc, sd->I, omega_in, pdf); + break; +#endif /* __PRINCIPLED__ */ #endif #ifdef __VOLUME__ case CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID: @@ -311,11 +357,16 @@ ccl_device void bsdf_blur(KernelGlobals *kg, ShaderClosure *sc, float roughness) #ifdef __SVM__ switch(sc->type) { case CLOSURE_BSDF_MICROFACET_MULTI_GGX_ID: + case CLOSURE_BSDF_MICROFACET_MULTI_GGX_FRESNEL_ID: case CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID: + case CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_FRESNEL_ID: bsdf_microfacet_multi_ggx_blur(sc, roughness); break; case CLOSURE_BSDF_MICROFACET_GGX_ID: + case CLOSURE_BSDF_MICROFACET_GGX_FRESNEL_ID: + case CLOSURE_BSDF_MICROFACET_GGX_CLEARCOAT_ID: case CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID: + case CLOSURE_BSDF_MICROFACET_GGX_ANISO_FRESNEL_ID: case CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID: bsdf_microfacet_ggx_blur(sc, roughness); break; @@ -349,10 +400,15 @@ ccl_device bool bsdf_merge(ShaderClosure *a, ShaderClosure *b) case CLOSURE_BSDF_REFLECTION_ID: case CLOSURE_BSDF_REFRACTION_ID: case CLOSURE_BSDF_MICROFACET_GGX_ID: + case 
CLOSURE_BSDF_MICROFACET_GGX_FRESNEL_ID: + case CLOSURE_BSDF_MICROFACET_GGX_CLEARCOAT_ID: case CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID: + case CLOSURE_BSDF_MICROFACET_GGX_ANISO_FRESNEL_ID: case CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID: case CLOSURE_BSDF_MICROFACET_MULTI_GGX_ID: + case CLOSURE_BSDF_MICROFACET_MULTI_GGX_FRESNEL_ID: case CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID: + case CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_FRESNEL_ID: case CLOSURE_BSDF_MICROFACET_BECKMANN_ID: case CLOSURE_BSDF_MICROFACET_BECKMANN_ANISO_ID: case CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID: @@ -367,6 +423,11 @@ ccl_device bool bsdf_merge(ShaderClosure *a, ShaderClosure *b) case CLOSURE_BSDF_HAIR_REFLECTION_ID: case CLOSURE_BSDF_HAIR_TRANSMISSION_ID: return bsdf_hair_merge(a, b); +#ifdef __PRINCIPLED__ + case CLOSURE_BSDF_PRINCIPLED_DIFFUSE_ID: + case CLOSURE_BSDF_BSSRDF_PRINCIPLED_ID: + return bsdf_principled_diffuse_merge(a, b); +#endif #ifdef __VOLUME__ case CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID: return volume_henyey_greenstein_merge(a, b); @@ -379,5 +440,23 @@ ccl_device bool bsdf_merge(ShaderClosure *a, ShaderClosure *b) #endif } +/* Classifies a closure as diffuse-like or specular-like. + * This is needed to generate the denoising feature passes, + * which are written on the first bounce where more than 25% + * of the sampling weight belongs to diffuse-like closures. 
*/ +ccl_device_inline bool bsdf_is_specular_like(ShaderClosure *sc) +{ + if(CLOSURE_IS_BSDF_TRANSPARENT(sc->type)) { + return true; + } + + if(CLOSURE_IS_BSDF_MICROFACET(sc->type)) { + MicrofacetBsdf *bsdf = (MicrofacetBsdf*) sc; + return (bsdf->alpha_x*bsdf->alpha_y <= 0.075f*0.075f); + } + + return false; +} + CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/closure/bsdf_ashikhmin_velvet.h b/intern/cycles/kernel/closure/bsdf_ashikhmin_velvet.h index 7e0f5a7ec75..a5ba2cb2972 100644 --- a/intern/cycles/kernel/closure/bsdf_ashikhmin_velvet.h +++ b/intern/cycles/kernel/closure/bsdf_ashikhmin_velvet.h @@ -40,7 +40,6 @@ typedef ccl_addr_space struct VelvetBsdf { float sigma; float invsigma2; - float3 N; } VelvetBsdf; ccl_device int bsdf_ashikhmin_velvet_setup(VelvetBsdf *bsdf) diff --git a/intern/cycles/kernel/closure/bsdf_diffuse.h b/intern/cycles/kernel/closure/bsdf_diffuse.h index dcd187f9305..ec6f1f20996 100644 --- a/intern/cycles/kernel/closure/bsdf_diffuse.h +++ b/intern/cycles/kernel/closure/bsdf_diffuse.h @@ -37,7 +37,6 @@ CCL_NAMESPACE_BEGIN typedef ccl_addr_space struct DiffuseBsdf { SHADER_CLOSURE_BASE; - float3 N; } DiffuseBsdf; /* DIFFUSE */ diff --git a/intern/cycles/kernel/closure/bsdf_diffuse_ramp.h b/intern/cycles/kernel/closure/bsdf_diffuse_ramp.h index 2d982a95fe4..24f40af46a3 100644 --- a/intern/cycles/kernel/closure/bsdf_diffuse_ramp.h +++ b/intern/cycles/kernel/closure/bsdf_diffuse_ramp.h @@ -40,7 +40,6 @@ CCL_NAMESPACE_BEGIN typedef ccl_addr_space struct DiffuseRampBsdf { SHADER_CLOSURE_BASE; - float3 N; float3 *colors; } DiffuseRampBsdf; diff --git a/intern/cycles/kernel/closure/bsdf_microfacet.h b/intern/cycles/kernel/closure/bsdf_microfacet.h index 1c7b3eb9ddd..b12e248f0a3 100644 --- a/intern/cycles/kernel/closure/bsdf_microfacet.h +++ b/intern/cycles/kernel/closure/bsdf_microfacet.h @@ -36,7 +36,8 @@ CCL_NAMESPACE_BEGIN typedef ccl_addr_space struct MicrofacetExtra { - float3 color; + float3 color, cspec0; + float clearcoat; } 
MicrofacetExtra; typedef ccl_addr_space struct MicrofacetBsdf { @@ -45,7 +46,6 @@ typedef ccl_addr_space struct MicrofacetBsdf { float alpha_x, alpha_y, ior; MicrofacetExtra *extra; float3 T; - float3 N; } MicrofacetBsdf; /* Beckmann and GGX microfacet importance sampling. */ @@ -233,6 +233,36 @@ ccl_device_forceinline float3 microfacet_sample_stretched( return normalize(make_float3(-slope_x, -slope_y, 1.0f)); } +/* Calculate the reflection color + * + * If fresnel is used, the color is an interpolation of the F0 color and white + * with respect to the fresnel + * + * Else it is simply white + */ +ccl_device_forceinline float3 reflection_color(const MicrofacetBsdf *bsdf, float3 L, float3 H) { + float3 F = make_float3(1.0f, 1.0f, 1.0f); + bool use_fresnel = (bsdf->type == CLOSURE_BSDF_MICROFACET_GGX_FRESNEL_ID + || bsdf->type == CLOSURE_BSDF_MICROFACET_GGX_CLEARCOAT_ID + || bsdf->type == CLOSURE_BSDF_MICROFACET_GGX_ANISO_FRESNEL_ID); + + if(use_fresnel) { + float F0 = fresnel_dielectric_cos(1.0f, bsdf->ior); + + F = interpolate_fresnel_color(L, H, bsdf->ior, F0, bsdf->extra->cspec0); + } + + return F; +} + +ccl_device_forceinline float D_GTR1(float NdotH, float alpha) +{ + if(alpha >= 1.0f) return M_1_PI_F; + float alpha2 = alpha*alpha; + float t = 1.0f + (alpha2 - 1.0f) * NdotH*NdotH; + return (alpha2 - 1.0f) / (M_PI_F * logf(alpha2) * t); +} + /* GGX microfacet with Smith shadow-masking from: * * Microfacet Models for Refraction through Rough Surfaces @@ -248,14 +278,52 @@ ccl_device_forceinline float3 microfacet_sample_stretched( ccl_device int bsdf_microfacet_ggx_setup(MicrofacetBsdf *bsdf) { + bsdf->extra = NULL; + bsdf->alpha_x = saturate(bsdf->alpha_x); bsdf->alpha_y = bsdf->alpha_x; - + bsdf->type = CLOSURE_BSDF_MICROFACET_GGX_ID; return SD_BSDF|SD_BSDF_HAS_EVAL; } +ccl_device int bsdf_microfacet_ggx_fresnel_setup(MicrofacetBsdf *bsdf, const ShaderData *sd) +{ + bsdf->extra->cspec0.x = saturate(bsdf->extra->cspec0.x); + bsdf->extra->cspec0.y = 
saturate(bsdf->extra->cspec0.y); + bsdf->extra->cspec0.z = saturate(bsdf->extra->cspec0.z); + + float F0 = fresnel_dielectric_cos(1.0f, bsdf->ior); + float F = average(interpolate_fresnel_color(sd->I, bsdf->N, bsdf->ior, F0, bsdf->extra->cspec0)); + bsdf->sample_weight *= F; + + bsdf->alpha_x = saturate(bsdf->alpha_x); + bsdf->alpha_y = bsdf->alpha_x; + + bsdf->type = CLOSURE_BSDF_MICROFACET_GGX_FRESNEL_ID; + + return SD_BSDF|SD_BSDF_HAS_EVAL; +} + +ccl_device int bsdf_microfacet_ggx_clearcoat_setup(MicrofacetBsdf *bsdf, const ShaderData *sd) +{ + bsdf->extra->cspec0.x = saturate(bsdf->extra->cspec0.x); + bsdf->extra->cspec0.y = saturate(bsdf->extra->cspec0.y); + bsdf->extra->cspec0.z = saturate(bsdf->extra->cspec0.z); + + float F0 = fresnel_dielectric_cos(1.0f, bsdf->ior); + float F = average(interpolate_fresnel_color(sd->I, bsdf->N, bsdf->ior, F0, bsdf->extra->cspec0)); + bsdf->sample_weight *= 0.25f * bsdf->extra->clearcoat * F; + + bsdf->alpha_x = saturate(bsdf->alpha_x); + bsdf->alpha_y = bsdf->alpha_x; + + bsdf->type = CLOSURE_BSDF_MICROFACET_GGX_CLEARCOAT_ID; + + return SD_BSDF|SD_BSDF_HAS_EVAL; +} + ccl_device bool bsdf_microfacet_merge(const ShaderClosure *a, const ShaderClosure *b) { const MicrofacetBsdf *bsdf_a = (const MicrofacetBsdf*)a; @@ -273,16 +341,38 @@ ccl_device bool bsdf_microfacet_merge(const ShaderClosure *a, const ShaderClosur ccl_device int bsdf_microfacet_ggx_aniso_setup(MicrofacetBsdf *bsdf) { + bsdf->extra = NULL; + bsdf->alpha_x = saturate(bsdf->alpha_x); bsdf->alpha_y = saturate(bsdf->alpha_y); - + bsdf->type = CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID; return SD_BSDF|SD_BSDF_HAS_EVAL; } +ccl_device int bsdf_microfacet_ggx_aniso_fresnel_setup(MicrofacetBsdf *bsdf, const ShaderData *sd) +{ + bsdf->extra->cspec0.x = saturate(bsdf->extra->cspec0.x); + bsdf->extra->cspec0.y = saturate(bsdf->extra->cspec0.y); + bsdf->extra->cspec0.z = saturate(bsdf->extra->cspec0.z); + + float F0 = fresnel_dielectric_cos(1.0f, bsdf->ior); + float F = 
average(interpolate_fresnel_color(sd->I, bsdf->N, bsdf->ior, F0, bsdf->extra->cspec0)); + bsdf->sample_weight *= F; + + bsdf->alpha_x = saturate(bsdf->alpha_x); + bsdf->alpha_y = saturate(bsdf->alpha_y); + + bsdf->type = CLOSURE_BSDF_MICROFACET_GGX_ANISO_FRESNEL_ID; + + return SD_BSDF|SD_BSDF_HAS_EVAL; +} + ccl_device int bsdf_microfacet_ggx_refraction_setup(MicrofacetBsdf *bsdf) { + bsdf->extra = NULL; + bsdf->alpha_x = saturate(bsdf->alpha_x); bsdf->alpha_y = bsdf->alpha_x; @@ -319,6 +409,8 @@ ccl_device float3 bsdf_microfacet_ggx_eval_reflect(const ShaderClosure *sc, cons float alpha2 = alpha_x * alpha_y; float D, G1o, G1i; + bool is_principled_clearcoat = (bsdf->type == CLOSURE_BSDF_MICROFACET_GGX_CLEARCOAT_ID); + if(alpha_x == alpha_y) { /* isotropic * eq. 20: (F*G*D)/(4*in*on) @@ -327,7 +419,18 @@ ccl_device float3 bsdf_microfacet_ggx_eval_reflect(const ShaderClosure *sc, cons float cosThetaM2 = cosThetaM * cosThetaM; float cosThetaM4 = cosThetaM2 * cosThetaM2; float tanThetaM2 = (1 - cosThetaM2) / cosThetaM2; - D = alpha2 / (M_PI_F * cosThetaM4 * (alpha2 + tanThetaM2) * (alpha2 + tanThetaM2)); + + if(is_principled_clearcoat) { + /* use GTR1 for clearcoat */ + D = D_GTR1(cosThetaM, bsdf->alpha_x); + + /* the alpha value for clearcoat is a fixed 0.25 => alpha2 = 0.25 * 0.25 */ + alpha2 = 0.0625f; + } + else { + /* use GTR2 otherwise */ + D = alpha2 / (M_PI_F * cosThetaM4 * (alpha2 + tanThetaM2) * (alpha2 + tanThetaM2)); + } /* eq. 34: now calculate G1(i,m) and G1(o,m) */ G1o = 2 / (1 + safe_sqrtf(1 + alpha2 * (1 - cosNO * cosNO) / (cosNO * cosNO))); @@ -374,7 +477,13 @@ ccl_device float3 bsdf_microfacet_ggx_eval_reflect(const ShaderClosure *sc, cons /* eq. 20 */ float common = D * 0.25f / cosNO; - float out = G * common; + + float3 F = reflection_color(bsdf, omega_in, m); + if(is_principled_clearcoat) { + F *= 0.25f * bsdf->extra->clearcoat; + } + + float3 out = F * G * common; /* eq. 
2 in distribution of visible normals sampling * pm = Dw = G1o * dot(m, I) * D / dot(N, I); */ @@ -384,7 +493,7 @@ ccl_device float3 bsdf_microfacet_ggx_eval_reflect(const ShaderClosure *sc, cons * pdf = pm * 0.25 / dot(m, I); */ *pdf = G1o * common; - return make_float3(out, out, out); + return out; } return make_float3(0.0f, 0.0f, 0.0f); @@ -489,6 +598,17 @@ ccl_device int bsdf_microfacet_ggx_sample(KernelGlobals *kg, const ShaderClosure /* some high number for MIS */ *pdf = 1e6f; *eval = make_float3(1e6f, 1e6f, 1e6f); + + bool use_fresnel = (bsdf->type == CLOSURE_BSDF_MICROFACET_GGX_FRESNEL_ID + || bsdf->type == CLOSURE_BSDF_MICROFACET_GGX_CLEARCOAT_ID + || bsdf->type == CLOSURE_BSDF_MICROFACET_GGX_ANISO_FRESNEL_ID); + + /* if fresnel is used, calculate the color with reflection_color(...) */ + if(use_fresnel) { + *pdf = 1.0f; + *eval = reflection_color(bsdf, *omega_in, m); + } + label = LABEL_REFLECT | LABEL_SINGULAR; } else { @@ -497,16 +617,32 @@ ccl_device int bsdf_microfacet_ggx_sample(KernelGlobals *kg, const ShaderClosure float alpha2 = alpha_x * alpha_y; float D, G1i; + bool is_principled_clearcoat = (bsdf->type == CLOSURE_BSDF_MICROFACET_GGX_CLEARCOAT_ID); + if(alpha_x == alpha_y) { /* isotropic */ float cosThetaM2 = cosThetaM * cosThetaM; float cosThetaM4 = cosThetaM2 * cosThetaM2; float tanThetaM2 = 1/(cosThetaM2) - 1; - D = alpha2 / (M_PI_F * cosThetaM4 * (alpha2 + tanThetaM2) * (alpha2 + tanThetaM2)); /* eval BRDF*cosNI */ float cosNI = dot(N, *omega_in); + if(is_principled_clearcoat) { + /* use GTR1 for clearcoat */ + D = D_GTR1(cosThetaM, bsdf->alpha_x); + + /* the alpha value for clearcoat is a fixed 0.25 => alpha2 = 0.25 * 0.25 */ + alpha2 = 0.0625f; + + /* recalculate G1o */ + G1o = 2 / (1 + safe_sqrtf(1 + alpha2 * (1 - cosNO * cosNO) / (cosNO * cosNO))); + } + else { + /* use GTR2 otherwise */ + D = alpha2 / (M_PI_F * cosThetaM4 * (alpha2 + tanThetaM2) * (alpha2 + tanThetaM2)); + } + /* eq. 
34: now calculate G1(i,m) */ G1i = 2 / (1 + safe_sqrtf(1 + alpha2 * (1 - cosNI * cosNI) / (cosNI * cosNI))); } @@ -538,10 +674,14 @@ ccl_device int bsdf_microfacet_ggx_sample(KernelGlobals *kg, const ShaderClosure /* see eval function for derivation */ float common = (G1o * D) * 0.25f / cosNO; - float out = G1i * common; *pdf = common; - *eval = make_float3(out, out, out); + float3 F = reflection_color(bsdf, *omega_in, m); + if(is_principled_clearcoat) { + F *= 0.25f * bsdf->extra->clearcoat; + } + + *eval = G1i * common * F; } #ifdef __RAY_DIFFERENTIALS__ diff --git a/intern/cycles/kernel/closure/bsdf_microfacet_multi.h b/intern/cycles/kernel/closure/bsdf_microfacet_multi.h index 7d87727004f..2f2c35d5d1f 100644 --- a/intern/cycles/kernel/closure/bsdf_microfacet_multi.h +++ b/intern/cycles/kernel/closure/bsdf_microfacet_multi.h @@ -40,20 +40,20 @@ ccl_device_forceinline float D_ggx_aniso(const float3 wm, const float2 alpha) } /* Sample slope distribution (based on page 14 of the supplemental implementation). 
*/ -ccl_device_forceinline float2 mf_sampleP22_11(const float cosI, const float2 randU) +ccl_device_forceinline float2 mf_sampleP22_11(const float cosI, const float randx, const float randy) { - if(cosI > 0.9999f || cosI < 1e-6f) { - const float r = sqrtf(randU.x / max(1.0f - randU.x, 1e-7f)); - const float phi = M_2PI_F * randU.y; + if(cosI > 0.9999f || fabsf(cosI) < 1e-6f) { + const float r = sqrtf(randx / max(1.0f - randx, 1e-7f)); + const float phi = M_2PI_F * randy; return make_float2(r*cosf(phi), r*sinf(phi)); } - const float sinI = sqrtf(1.0f - cosI*cosI); + const float sinI = safe_sqrtf(1.0f - cosI*cosI); const float tanI = sinI/cosI; const float projA = 0.5f * (cosI + 1.0f); if(projA < 0.0001f) return make_float2(0.0f, 0.0f); - const float A = 2.0f*randU.x*projA / cosI - 1.0f; + const float A = 2.0f*randx*projA / cosI - 1.0f; float tmp = A*A-1.0f; if(fabsf(tmp) < 1e-7f) return make_float2(0.0f, 0.0f); @@ -64,24 +64,24 @@ ccl_device_forceinline float2 mf_sampleP22_11(const float cosI, const float2 ran const float slopeX = (A < 0.0f || slopeX2 > 1.0f/tanI)? (tanI*tmp - D) : slopeX2; float U2; - if(randU.y >= 0.5f) - U2 = 2.0f*(randU.y - 0.5f); + if(randy >= 0.5f) + U2 = 2.0f*(randy - 0.5f); else - U2 = 2.0f*(0.5f - randU.y); + U2 = 2.0f*(0.5f - randy); const float z = (U2*(U2*(U2*0.27385f-0.73369f)+0.46341f)) / (U2*(U2*(U2*0.093073f+0.309420f)-1.0f)+0.597999f); const float slopeY = z * sqrtf(1.0f + slopeX*slopeX); - if(randU.y >= 0.5f) + if(randy >= 0.5f) return make_float2(slopeX, slopeY); else return make_float2(slopeX, -slopeY); } /* Visible normal sampling for the GGX distribution (based on page 7 of the supplemental implementation). 
*/ -ccl_device_forceinline float3 mf_sample_vndf(const float3 wi, const float2 alpha, const float2 randU) +ccl_device_forceinline float3 mf_sample_vndf(const float3 wi, const float2 alpha, const float randx, const float randy) { const float3 wi_11 = normalize(make_float3(alpha.x*wi.x, alpha.y*wi.y, wi.z)); - const float2 slope_11 = mf_sampleP22_11(wi_11.z, randU); + const float2 slope_11 = mf_sampleP22_11(wi_11.z, randx, randy); const float3 cossin_phi = safe_normalize(make_float3(wi_11.x, wi_11.y, 0.0f)); const float slope_x = alpha.x*(cossin_phi.x * slope_11.x - cossin_phi.y * slope_11.y); @@ -91,18 +91,15 @@ ccl_device_forceinline float3 mf_sample_vndf(const float3 wi, const float2 alpha return normalize(make_float3(-slope_x, -slope_y, 1.0f)); } -/* === Phase functions: Glossy, Diffuse and Glass === */ +/* === Phase functions: Glossy and Glass === */ -/* Phase function for reflective materials, either without a fresnel term (for compatibility) or with the conductive fresnel term. */ -ccl_device_forceinline float3 mf_sample_phase_glossy(const float3 wi, float3 *n, float3 *k, float3 *weight, const float3 wm) +/* Phase function for reflective materials. */ +ccl_device_forceinline float3 mf_sample_phase_glossy(const float3 wi, float3 *weight, const float3 wm) { - if(n && k) - *weight *= fresnel_conductor(dot(wi, wm), *n, *k); - return -wi + 2.0f * wm * dot(wi, wm); } -ccl_device_forceinline float3 mf_eval_phase_glossy(const float3 w, const float lambda, const float3 wo, const float2 alpha, float3 *n, float3 *k) +ccl_device_forceinline float3 mf_eval_phase_glossy(const float3 w, const float lambda, const float3 wo, const float2 alpha) { if(w.z > 0.9999f) return make_float3(0.0f, 0.0f, 0.0f); @@ -123,30 +120,9 @@ ccl_device_forceinline float3 mf_eval_phase_glossy(const float3 w, const float l else phase *= D_ggx_aniso(wh, alpha); - if(n && k) { - /* Apply conductive fresnel term. 
*/ - return phase * fresnel_conductor(dotW_WH, *n, *k); - } - return make_float3(phase, phase, phase); } -/* Phase function for rough lambertian diffuse surfaces. */ -ccl_device_forceinline float3 mf_sample_phase_diffuse(const float3 wm, const float randu, const float randv) -{ - float3 tm, bm; - make_orthonormals(wm, &tm, &bm); - - float2 disk = concentric_sample_disk(randu, randv); - return disk.x*tm + disk.y*bm + safe_sqrtf(1.0f - disk.x*disk.x - disk.y*disk.y)*wm; -} - -ccl_device_forceinline float3 mf_eval_phase_diffuse(const float3 w, const float3 wm) -{ - const float v = max(0.0f, dot(w, wm)) * M_1_PI_F; - return make_float3(v, v, v); -} - /* Phase function for dielectric transmissive materials, including both reflection and refraction according to the dielectric fresnel term. */ ccl_device_forceinline float3 mf_sample_phase_glass(const float3 wi, const float eta, const float3 wm, const float randV, bool *outside) { @@ -269,40 +245,69 @@ ccl_device_forceinline float mf_ggx_albedo(float r) return saturate(albedo); } +ccl_device_inline float mf_ggx_transmission_albedo(float a, float ior) +{ + if(ior < 1.0f) { + ior = 1.0f/ior; + } + a = saturate(a); + ior = clamp(ior, 1.0f, 3.0f); + float I_1 = 0.0476898f*expf(-0.978352f*(ior-0.65657f)*(ior-0.65657f)) - 0.033756f*ior + 0.993261f; + float R_1 = (((0.116991f*a - 0.270369f)*a + 0.0501366f)*a - 0.00411511f)*a + 1.00008f; + float I_2 = (((-2.08704f*ior + 26.3298f)*ior - 127.906f)*ior + 292.958f)*ior - 287.946f + 199.803f/(ior*ior) - 101.668f/(ior*ior*ior); + float R_2 = ((((5.3725f*a -24.9307f)*a + 22.7437f)*a - 3.40751f)*a + 0.0986325f)*a + 0.00493504f; + + return saturate(1.0f + I_2*R_2*0.0019127f - (1.0f - I_1)*(1.0f - R_1)*9.3205f); +} + ccl_device_forceinline float mf_ggx_pdf(const float3 wi, const float3 wo, const float alpha) { float D = D_ggx(normalize(wi+wo), alpha); float lambda = mf_lambda(wi, make_float2(alpha, alpha)); + float singlescatter = 0.25f * D / max((1.0f + lambda) * wi.z, 1e-7f); + + float 
multiscatter = wo.z * M_1_PI_F; + float albedo = mf_ggx_albedo(alpha); - return 0.25f * D / max((1.0f + lambda) * wi.z, 1e-7f) + (1.0f - albedo) * wo.z; + return albedo*singlescatter + (1.0f - albedo)*multiscatter; } ccl_device_forceinline float mf_ggx_aniso_pdf(const float3 wi, const float3 wo, const float2 alpha) { - return 0.25f * D_ggx_aniso(normalize(wi+wo), alpha) / ((1.0f + mf_lambda(wi, alpha)) * wi.z) + (1.0f - mf_ggx_albedo(sqrtf(alpha.x*alpha.y))) * wo.z; -} + float D = D_ggx_aniso(normalize(wi+wo), alpha); + float lambda = mf_lambda(wi, alpha); + float singlescatter = 0.25f * D / max((1.0f + lambda) * wi.z, 1e-7f); -ccl_device_forceinline float mf_diffuse_pdf(const float3 wo) -{ - return M_1_PI_F * wo.z; + float multiscatter = wo.z * M_1_PI_F; + + float albedo = mf_ggx_albedo(sqrtf(alpha.x*alpha.y)); + return albedo*singlescatter + (1.0f - albedo)*multiscatter; } ccl_device_forceinline float mf_glass_pdf(const float3 wi, const float3 wo, const float alpha, const float eta) { - float3 wh; - float fresnel; - if(wi.z*wo.z > 0.0f) { - wh = normalize(wi + wo); - fresnel = fresnel_dielectric_cos(dot(wi, wh), eta); - } - else { - wh = normalize(wi + wo*eta); - fresnel = 1.0f - fresnel_dielectric_cos(dot(wi, wh), eta); - } + bool reflective = (wi.z*wo.z > 0.0f); + + float wh_len; + float3 wh = normalize_len(wi + (reflective? wo : (wo*eta)), &wh_len); if(wh.z < 0.0f) wh = -wh; float3 r_wi = (wi.z < 0.0f)? 
-wi: wi; - return fresnel * max(0.0f, dot(r_wi, wh)) * D_ggx(wh, alpha) / ((1.0f + mf_lambda(r_wi, make_float2(alpha, alpha))) * r_wi.z) + fabsf(wo.z); + float lambda = mf_lambda(r_wi, make_float2(alpha, alpha)); + float D = D_ggx(wh, alpha); + float fresnel = fresnel_dielectric_cos(dot(r_wi, wh), eta); + + float multiscatter = fabsf(wo.z * M_1_PI_F); + if(reflective) { + float singlescatter = 0.25f * D / max((1.0f + lambda) * r_wi.z, 1e-7f); + float albedo = mf_ggx_albedo(alpha); + return fresnel * (albedo*singlescatter + (1.0f - albedo)*multiscatter); + } + else { + float singlescatter = fabsf(dot(r_wi, wh)*dot(wo, wh) * D * eta*eta / max((1.0f + lambda) * r_wi.z * wh_len*wh_len, 1e-7f)); + float albedo = mf_ggx_transmission_albedo(alpha, eta); + return (1.0f - fresnel) * (albedo*singlescatter + (1.0f - albedo)*multiscatter); + } } /* === Actual random walk implementations, one version of mf_eval and mf_sample per phase function. === */ @@ -315,13 +320,6 @@ ccl_device_forceinline float mf_glass_pdf(const float3 wi, const float3 wo, cons #define MF_MULTI_GLASS #include "kernel/closure/bsdf_microfacet_multi_impl.h" -/* The diffuse phase function is not implemented as a node yet. 
*/ -#if 0 -#define MF_PHASE_FUNCTION diffuse -#define MF_MULTI_DIFFUSE -#include "kernel/closure/bsdf_microfacet_multi_impl.h" -#endif - #define MF_PHASE_FUNCTION glossy #define MF_MULTI_GLOSSY #include "kernel/closure/bsdf_microfacet_multi_impl.h" @@ -345,8 +343,9 @@ ccl_device int bsdf_microfacet_multi_ggx_common_setup(MicrofacetBsdf *bsdf) bsdf->extra->color.x = saturate(bsdf->extra->color.x); bsdf->extra->color.y = saturate(bsdf->extra->color.y); bsdf->extra->color.z = saturate(bsdf->extra->color.z); - - bsdf->type = CLOSURE_BSDF_MICROFACET_MULTI_GGX_ID; + bsdf->extra->cspec0.x = saturate(bsdf->extra->cspec0.x); + bsdf->extra->cspec0.y = saturate(bsdf->extra->cspec0.y); + bsdf->extra->cspec0.z = saturate(bsdf->extra->cspec0.z); return SD_BSDF|SD_BSDF_HAS_EVAL|SD_BSDF_NEEDS_LCG; } @@ -356,6 +355,22 @@ ccl_device int bsdf_microfacet_multi_ggx_aniso_setup(MicrofacetBsdf *bsdf) if(is_zero(bsdf->T)) bsdf->T = make_float3(1.0f, 0.0f, 0.0f); + bsdf->type = CLOSURE_BSDF_MICROFACET_MULTI_GGX_ID; + + return bsdf_microfacet_multi_ggx_common_setup(bsdf); +} + +ccl_device int bsdf_microfacet_multi_ggx_aniso_fresnel_setup(MicrofacetBsdf *bsdf, const ShaderData *sd) +{ + if(is_zero(bsdf->T)) + bsdf->T = make_float3(1.0f, 0.0f, 0.0f); + + bsdf->type = CLOSURE_BSDF_MICROFACET_MULTI_GGX_FRESNEL_ID; + + float F0 = fresnel_dielectric_cos(1.0f, bsdf->ior); + float F = average(interpolate_fresnel_color(sd->I, bsdf->N, bsdf->ior, F0, bsdf->extra->cspec0)); + bsdf->sample_weight *= F; + return bsdf_microfacet_multi_ggx_common_setup(bsdf); } @@ -363,6 +378,30 @@ ccl_device int bsdf_microfacet_multi_ggx_setup(MicrofacetBsdf *bsdf) { bsdf->alpha_y = bsdf->alpha_x; + bsdf->type = CLOSURE_BSDF_MICROFACET_MULTI_GGX_ID; + + return bsdf_microfacet_multi_ggx_common_setup(bsdf); +} + +ccl_device int bsdf_microfacet_multi_ggx_fresnel_setup(MicrofacetBsdf *bsdf, const ShaderData *sd) +{ + bsdf->alpha_y = bsdf->alpha_x; + + bsdf->type = CLOSURE_BSDF_MICROFACET_MULTI_GGX_FRESNEL_ID; + + float F0 = 
fresnel_dielectric_cos(1.0f, bsdf->ior); + float F = average(interpolate_fresnel_color(sd->I, bsdf->N, bsdf->ior, F0, bsdf->extra->cspec0)); + bsdf->sample_weight *= F; + + return bsdf_microfacet_multi_ggx_common_setup(bsdf); +} + +ccl_device int bsdf_microfacet_multi_ggx_refraction_setup(MicrofacetBsdf *bsdf) +{ + bsdf->alpha_y = bsdf->alpha_x; + + bsdf->type = CLOSURE_BSDF_MICROFACET_MULTI_GGX_ID; + return bsdf_microfacet_multi_ggx_common_setup(bsdf); } @@ -378,6 +417,8 @@ ccl_device float3 bsdf_microfacet_multi_ggx_eval_reflect(const ShaderClosure *sc return make_float3(0.0f, 0.0f, 0.0f); } + bool use_fresnel = (bsdf->type == CLOSURE_BSDF_MICROFACET_MULTI_GGX_FRESNEL_ID); + bool is_aniso = (bsdf->alpha_x != bsdf->alpha_y); float3 X, Y, Z; Z = bsdf->N; @@ -393,7 +434,7 @@ ccl_device float3 bsdf_microfacet_multi_ggx_eval_reflect(const ShaderClosure *sc *pdf = mf_ggx_aniso_pdf(localI, localO, make_float2(bsdf->alpha_x, bsdf->alpha_y)); else *pdf = mf_ggx_pdf(localI, localO, bsdf->alpha_x); - return mf_eval_glossy(localI, localO, true, bsdf->extra->color, bsdf->alpha_x, bsdf->alpha_y, lcg_state, NULL, NULL); + return mf_eval_glossy(localI, localO, true, bsdf->extra->color, bsdf->alpha_x, bsdf->alpha_y, lcg_state, bsdf->ior, use_fresnel, bsdf->extra->cspec0); } ccl_device int bsdf_microfacet_multi_ggx_sample(KernelGlobals *kg, const ShaderClosure *sc, float3 Ng, float3 I, float3 dIdx, float3 dIdy, float randu, float randv, float3 *eval, float3 *omega_in, float3 *domega_in_dx, float3 *domega_in_dy, float *pdf, ccl_addr_space uint *lcg_state) @@ -407,9 +448,15 @@ ccl_device int bsdf_microfacet_multi_ggx_sample(KernelGlobals *kg, const ShaderC *omega_in = 2*dot(Z, I)*Z - I; *pdf = 1e6f; *eval = make_float3(1e6f, 1e6f, 1e6f); +#ifdef __RAY_DIFFERENTIALS__ + *domega_in_dx = (2 * dot(Z, dIdx)) * Z - dIdx; + *domega_in_dy = (2 * dot(Z, dIdy)) * Z - dIdy; +#endif return LABEL_REFLECT|LABEL_SINGULAR; } + bool use_fresnel = (bsdf->type == 
CLOSURE_BSDF_MICROFACET_MULTI_GGX_FRESNEL_ID); + bool is_aniso = (bsdf->alpha_x != bsdf->alpha_y); if(is_aniso) make_orthonormals_tangent(Z, bsdf->T, &X, &Y); @@ -419,7 +466,7 @@ ccl_device int bsdf_microfacet_multi_ggx_sample(KernelGlobals *kg, const ShaderC float3 localI = make_float3(dot(I, X), dot(I, Y), dot(I, Z)); float3 localO; - *eval = mf_sample_glossy(localI, &localO, bsdf->extra->color, bsdf->alpha_x, bsdf->alpha_y, lcg_state, NULL, NULL); + *eval = mf_sample_glossy(localI, &localO, bsdf->extra->color, bsdf->alpha_x, bsdf->alpha_y, lcg_state, bsdf->ior, use_fresnel, bsdf->extra->cspec0); if(is_aniso) *pdf = mf_ggx_aniso_pdf(localI, localO, make_float2(bsdf->alpha_x, bsdf->alpha_y)); else @@ -427,6 +474,7 @@ ccl_device int bsdf_microfacet_multi_ggx_sample(KernelGlobals *kg, const ShaderC *eval *= *pdf; *omega_in = X*localO.x + Y*localO.y + Z*localO.z; + #ifdef __RAY_DIFFERENTIALS__ *domega_in_dx = (2 * dot(Z, dIdx)) * Z - dIdx; *domega_in_dy = (2 * dot(Z, dIdy)) * Z - dIdy; @@ -450,6 +498,27 @@ ccl_device int bsdf_microfacet_multi_ggx_glass_setup(MicrofacetBsdf *bsdf) return SD_BSDF|SD_BSDF_HAS_EVAL|SD_BSDF_NEEDS_LCG; } +ccl_device int bsdf_microfacet_multi_ggx_glass_fresnel_setup(MicrofacetBsdf *bsdf, const ShaderData *sd) +{ + bsdf->alpha_x = clamp(bsdf->alpha_x, 1e-4f, 1.0f); + bsdf->alpha_y = bsdf->alpha_x; + bsdf->ior = max(0.0f, bsdf->ior); + bsdf->extra->color.x = saturate(bsdf->extra->color.x); + bsdf->extra->color.y = saturate(bsdf->extra->color.y); + bsdf->extra->color.z = saturate(bsdf->extra->color.z); + bsdf->extra->cspec0.x = saturate(bsdf->extra->cspec0.x); + bsdf->extra->cspec0.y = saturate(bsdf->extra->cspec0.y); + bsdf->extra->cspec0.z = saturate(bsdf->extra->cspec0.z); + + bsdf->type = CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_FRESNEL_ID; + + float F0 = fresnel_dielectric_cos(1.0f, bsdf->ior); + float F = average(interpolate_fresnel_color(sd->I, bsdf->N, bsdf->ior, F0, bsdf->extra->cspec0)); + bsdf->sample_weight *= F; + + return 
SD_BSDF|SD_BSDF_HAS_EVAL|SD_BSDF_NEEDS_LCG; +} + ccl_device float3 bsdf_microfacet_multi_ggx_glass_eval_transmit(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf, ccl_addr_space uint *lcg_state) { const MicrofacetBsdf *bsdf = (const MicrofacetBsdf*)sc; @@ -465,7 +534,7 @@ ccl_device float3 bsdf_microfacet_multi_ggx_glass_eval_transmit(const ShaderClos float3 localO = make_float3(dot(omega_in, X), dot(omega_in, Y), dot(omega_in, Z)); *pdf = mf_glass_pdf(localI, localO, bsdf->alpha_x, bsdf->ior); - return mf_eval_glass(localI, localO, false, bsdf->extra->color, bsdf->alpha_x, bsdf->alpha_y, lcg_state, bsdf->ior); + return mf_eval_glass(localI, localO, false, bsdf->extra->color, bsdf->alpha_x, bsdf->alpha_y, lcg_state, bsdf->ior, false, bsdf->extra->color); } ccl_device float3 bsdf_microfacet_multi_ggx_glass_eval_reflect(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf, ccl_addr_space uint *lcg_state) { @@ -475,6 +544,8 @@ ccl_device float3 bsdf_microfacet_multi_ggx_glass_eval_reflect(const ShaderClosu return make_float3(0.0f, 0.0f, 0.0f); } + bool use_fresnel = (bsdf->type == CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_FRESNEL_ID); + float3 X, Y, Z; Z = bsdf->N; make_orthonormals(Z, &X, &Y); @@ -483,7 +554,7 @@ ccl_device float3 bsdf_microfacet_multi_ggx_glass_eval_reflect(const ShaderClosu float3 localO = make_float3(dot(omega_in, X), dot(omega_in, Y), dot(omega_in, Z)); *pdf = mf_glass_pdf(localI, localO, bsdf->alpha_x, bsdf->ior); - return mf_eval_glass(localI, localO, true, bsdf->extra->color, bsdf->alpha_x, bsdf->alpha_y, lcg_state, bsdf->ior); + return mf_eval_glass(localI, localO, true, bsdf->extra->color, bsdf->alpha_x, bsdf->alpha_y, lcg_state, bsdf->ior, use_fresnel, bsdf->extra->cspec0); } ccl_device int bsdf_microfacet_multi_ggx_glass_sample(KernelGlobals *kg, const ShaderClosure *sc, float3 Ng, float3 I, float3 dIdx, float3 dIdy, float randu, float randv, float3 *eval, float3 *omega_in, float3 
*domega_in_dx, float3 *domega_in_dy, float *pdf, ccl_addr_space uint *lcg_state) @@ -525,12 +596,14 @@ ccl_device int bsdf_microfacet_multi_ggx_glass_sample(KernelGlobals *kg, const S } } + bool use_fresnel = (bsdf->type == CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_FRESNEL_ID); + make_orthonormals(Z, &X, &Y); float3 localI = make_float3(dot(I, X), dot(I, Y), dot(I, Z)); float3 localO; - *eval = mf_sample_glass(localI, &localO, bsdf->extra->color, bsdf->alpha_x, bsdf->alpha_y, lcg_state, bsdf->ior); + *eval = mf_sample_glass(localI, &localO, bsdf->extra->color, bsdf->alpha_x, bsdf->alpha_y, lcg_state, bsdf->ior, use_fresnel, bsdf->extra->cspec0); *pdf = mf_glass_pdf(localI, localO, bsdf->alpha_x, bsdf->ior); *eval *= *pdf; diff --git a/intern/cycles/kernel/closure/bsdf_microfacet_multi_impl.h b/intern/cycles/kernel/closure/bsdf_microfacet_multi_impl.h index 8054fa8e849..e73915dbda7 100644 --- a/intern/cycles/kernel/closure/bsdf_microfacet_multi_impl.h +++ b/intern/cycles/kernel/closure/bsdf_microfacet_multi_impl.h @@ -26,19 +26,16 @@ * the balance heuristic isn't necessarily optimal anymore. */ ccl_device_forceinline float3 MF_FUNCTION_FULL_NAME(mf_eval)( - float3 wi, - float3 wo, - const bool wo_outside, - const float3 color, - const float alpha_x, - const float alpha_y, - ccl_addr_space uint *lcg_state -#ifdef MF_MULTI_GLASS - , const float eta -#elif defined(MF_MULTI_GLOSSY) - , float3 *n, float3 *k -#endif -) + float3 wi, + float3 wo, + const bool wo_outside, + const float3 color, + const float alpha_x, + const float alpha_y, + ccl_addr_space uint *lcg_state, + const float eta, + bool use_fresnel, + const float3 cspec0) { /* Evaluating for a shallower incoming direction produces less noise, and the properties of the BSDF guarantee reciprocity. */ bool swapped = false; @@ -71,50 +68,57 @@ ccl_device_forceinline float3 MF_FUNCTION_FULL_NAME(mf_eval)( /* Analytically compute single scattering for lower noise. 
*/ float3 eval; + float3 throughput = make_float3(1.0f, 1.0f, 1.0f); + const float3 wh = normalize(wi+wo); #ifdef MF_MULTI_GLASS eval = mf_eval_phase_glass(-wi, lambda_r, wo, wo_outside, alpha, eta); if(wo_outside) eval *= -lambda_r / (shadowing_lambda - lambda_r); else eval *= -lambda_r * beta(-lambda_r, shadowing_lambda+1.0f); -#elif defined(MF_MULTI_DIFFUSE) - /* Diffuse has no special closed form for the single scattering bounce */ - eval = make_float3(0.0f, 0.0f, 0.0f); #else /* MF_MULTI_GLOSSY */ - const float3 wh = normalize(wi+wo); const float G2 = 1.0f / (1.0f - (lambda_r + 1.0f) + shadowing_lambda); float val = G2 * 0.25f / wi.z; if(alpha.x == alpha.y) val *= D_ggx(wh, alpha.x); else val *= D_ggx_aniso(wh, alpha); - if(n && k) { - eval = fresnel_conductor(dot(wh, wi), *n, *k) * val; - } - else { - eval = make_float3(val, val, val); - } + eval = make_float3(val, val, val); #endif + float F0 = fresnel_dielectric_cos(1.0f, eta); + if(use_fresnel) { + throughput = interpolate_fresnel_color(wi, wh, eta, F0, cspec0); + + eval *= throughput; + } + float3 wr = -wi; float hr = 1.0f; float C1_r = 1.0f; float G1_r = 0.0f; bool outside = true; - float3 throughput = make_float3(1.0f, 1.0f, 1.0f); for(int order = 0; order < 10; order++) { - /* Sample microfacet height and normal */ - if(!mf_sample_height(wr, &hr, &C1_r, &G1_r, &lambda_r, lcg_step_float_addrspace(lcg_state))) + /* Sample microfacet height. */ + float height_rand = lcg_step_float_addrspace(lcg_state); + if(!mf_sample_height(wr, &hr, &C1_r, &G1_r, &lambda_r, height_rand)) break; - float3 wm = mf_sample_vndf(-wr, alpha, make_float2(lcg_step_float_addrspace(lcg_state), - lcg_step_float_addrspace(lcg_state))); - -#ifdef MF_MULTI_DIFFUSE - if(order == 0) { - /* Compute single-scattering for diffuse. */ - const float G2_G1 = -lambda_r / (shadowing_lambda - lambda_r); - eval += throughput * G2_G1 * mf_eval_phase_diffuse(wo, wm); + /* Sample microfacet normal. 
*/ + float vndf_rand_y = lcg_step_float_addrspace(lcg_state); + float vndf_rand_x = lcg_step_float_addrspace(lcg_state); + float3 wm = mf_sample_vndf(-wr, alpha, vndf_rand_x, vndf_rand_y); + +#ifdef MF_MULTI_GLASS + if(order == 0 && use_fresnel) { + /* Evaluate amount of scattering towards wo on this microfacet. */ + float3 phase; + if(outside) + phase = mf_eval_phase_glass(wr, lambda_r, wo, wo_outside, alpha, eta); + else + phase = mf_eval_phase_glass(wr, lambda_r, -wo, !wo_outside, alpha, 1.0f / eta); + + eval = throughput * phase * mf_G1(wo_outside ? wo : -wo, mf_C1((outside == wo_outside) ? hr : -hr), shadowing_lambda); } #endif if(order > 0) { @@ -125,10 +129,8 @@ ccl_device_forceinline float3 MF_FUNCTION_FULL_NAME(mf_eval)( phase = mf_eval_phase_glass(wr, lambda_r, wo, wo_outside, alpha, eta); else phase = mf_eval_phase_glass(wr, lambda_r, -wo, !wo_outside, alpha, 1.0f/eta); -#elif defined(MF_MULTI_DIFFUSE) - phase = mf_eval_phase_diffuse(wo, wm); #else /* MF_MULTI_GLOSSY */ - phase = mf_eval_phase_glossy(wr, lambda_r, wo, alpha, n, k) * throughput; + phase = mf_eval_phase_glossy(wr, lambda_r, wo, alpha) * throughput; #endif eval += throughput * phase * mf_G1(wo_outside? wo: -wo, mf_C1((outside == wo_outside)? hr: -hr), shadowing_lambda); } @@ -136,23 +138,32 @@ ccl_device_forceinline float3 MF_FUNCTION_FULL_NAME(mf_eval)( /* Bounce from the microfacet. */ #ifdef MF_MULTI_GLASS bool next_outside; - wr = mf_sample_phase_glass(-wr, outside? eta: 1.0f/eta, wm, lcg_step_float_addrspace(lcg_state), &next_outside); + float3 wi_prev = -wr; + float phase_rand = lcg_step_float_addrspace(lcg_state); + wr = mf_sample_phase_glass(-wr, outside? 
eta: 1.0f/eta, wm, phase_rand, &next_outside); if(!next_outside) { outside = !outside; wr = -wr; hr = -hr; } -#elif defined(MF_MULTI_DIFFUSE) - wr = mf_sample_phase_diffuse(wm, - lcg_step_float_addrspace(lcg_state), - lcg_step_float_addrspace(lcg_state)); + + if(use_fresnel && !next_outside) { + throughput *= color; + } + else if(use_fresnel && order > 0) { + throughput *= interpolate_fresnel_color(wi_prev, wm, eta, F0, cspec0); + } #else /* MF_MULTI_GLOSSY */ - wr = mf_sample_phase_glossy(-wr, n, k, &throughput, wm); + if(use_fresnel && order > 0) { + throughput *= interpolate_fresnel_color(-wr, wm, eta, F0, cspec0); + } + wr = mf_sample_phase_glossy(-wr, &throughput, wm); #endif lambda_r = mf_lambda(wr, alpha); - throughput *= color; + if(!use_fresnel) + throughput *= color; C1_r = mf_C1(hr); G1_r = mf_G1(wr, C1_r, lambda_r); @@ -168,13 +179,16 @@ ccl_device_forceinline float3 MF_FUNCTION_FULL_NAME(mf_eval)( * escaped the surface in wo. The function returns the throughput between wi and wo. * Without reflection losses due to coloring or fresnel absorption in conductors, the sampling is optimal. 
*/ -ccl_device_forceinline float3 MF_FUNCTION_FULL_NAME(mf_sample)(float3 wi, float3 *wo, const float3 color, const float alpha_x, const float alpha_y, ccl_addr_space uint *lcg_state -#ifdef MF_MULTI_GLASS - , const float eta -#elif defined(MF_MULTI_GLOSSY) - , float3 *n, float3 *k -#endif -) +ccl_device_forceinline float3 MF_FUNCTION_FULL_NAME(mf_sample)( + float3 wi, + float3 *wo, + const float3 color, + const float alpha_x, + const float alpha_y, + ccl_addr_space uint *lcg_state, + const float eta, + bool use_fresnel, + const float3 cspec0) { const float2 alpha = make_float2(alpha_x, alpha_y); @@ -186,37 +200,64 @@ ccl_device_forceinline float3 MF_FUNCTION_FULL_NAME(mf_sample)(float3 wi, float3 float G1_r = 0.0f; bool outside = true; + float F0 = fresnel_dielectric_cos(1.0f, eta); + if(use_fresnel) { + throughput = interpolate_fresnel_color(wi, normalize(wi + wr), eta, F0, cspec0); + } + int order; for(order = 0; order < 10; order++) { /* Sample microfacet height. */ - if(!mf_sample_height(wr, &hr, &C1_r, &G1_r, &lambda_r, lcg_step_float_addrspace(lcg_state))) { + float height_rand = lcg_step_float_addrspace(lcg_state); + if(!mf_sample_height(wr, &hr, &C1_r, &G1_r, &lambda_r, height_rand)) { /* The random walk has left the surface. */ *wo = outside? wr: -wr; return throughput; } /* Sample microfacet normal. */ - float3 wm = mf_sample_vndf(-wr, alpha, make_float2(lcg_step_float_addrspace(lcg_state), - lcg_step_float_addrspace(lcg_state))); + float vndf_rand_y = lcg_step_float_addrspace(lcg_state); + float vndf_rand_x = lcg_step_float_addrspace(lcg_state); + float3 wm = mf_sample_vndf(-wr, alpha, vndf_rand_x, vndf_rand_y); /* First-bounce color is already accounted for in mix weight. */ - if(order > 0) + if(!use_fresnel && order > 0) throughput *= color; /* Bounce from the microfacet. */ #ifdef MF_MULTI_GLASS bool next_outside; - wr = mf_sample_phase_glass(-wr, outside? 
eta: 1.0f/eta, wm, lcg_step_float_addrspace(lcg_state), &next_outside); + float3 wi_prev = -wr; + float phase_rand = lcg_step_float_addrspace(lcg_state); + wr = mf_sample_phase_glass(-wr, outside? eta: 1.0f/eta, wm, phase_rand, &next_outside); if(!next_outside) { hr = -hr; wr = -wr; outside = !outside; } -#elif defined(MF_MULTI_DIFFUSE) - wr = mf_sample_phase_diffuse(wm, - lcg_step_float_addrspace(lcg_state), - lcg_step_float_addrspace(lcg_state)); + + if(use_fresnel) { + if(!next_outside) { + throughput *= color; + } + else { + float3 t_color = interpolate_fresnel_color(wi_prev, wm, eta, F0, cspec0); + + if(order == 0) + throughput = t_color; + else + throughput *= t_color; + } + } #else /* MF_MULTI_GLOSSY */ - wr = mf_sample_phase_glossy(-wr, n, k, &throughput, wm); + if(use_fresnel) { + float3 t_color = interpolate_fresnel_color(-wr, wm, eta, F0, cspec0); + + if(order == 0) + throughput = t_color; + else + throughput *= t_color; + } + wr = mf_sample_phase_glossy(-wr, &throughput, wm); #endif /* Update random walk parameters. 
*/ @@ -228,6 +269,5 @@ ccl_device_forceinline float3 MF_FUNCTION_FULL_NAME(mf_sample)(float3 wi, float3 } #undef MF_MULTI_GLASS -#undef MF_MULTI_DIFFUSE #undef MF_MULTI_GLOSSY #undef MF_PHASE_FUNCTION diff --git a/intern/cycles/kernel/closure/bsdf_oren_nayar.h b/intern/cycles/kernel/closure/bsdf_oren_nayar.h index cb342a026ef..6b770fc0c16 100644 --- a/intern/cycles/kernel/closure/bsdf_oren_nayar.h +++ b/intern/cycles/kernel/closure/bsdf_oren_nayar.h @@ -22,7 +22,6 @@ CCL_NAMESPACE_BEGIN typedef ccl_addr_space struct OrenNayarBsdf { SHADER_CLOSURE_BASE; - float3 N; float roughness; float a; float b; diff --git a/intern/cycles/kernel/closure/bsdf_phong_ramp.h b/intern/cycles/kernel/closure/bsdf_phong_ramp.h index e152a8780db..420f94755ee 100644 --- a/intern/cycles/kernel/closure/bsdf_phong_ramp.h +++ b/intern/cycles/kernel/closure/bsdf_phong_ramp.h @@ -40,7 +40,6 @@ CCL_NAMESPACE_BEGIN typedef ccl_addr_space struct PhongRampBsdf { SHADER_CLOSURE_BASE; - float3 N; float exponent; float3 *colors; } PhongRampBsdf; diff --git a/intern/cycles/kernel/closure/bsdf_principled_diffuse.h b/intern/cycles/kernel/closure/bsdf_principled_diffuse.h new file mode 100644 index 00000000000..f8ca64293b0 --- /dev/null +++ b/intern/cycles/kernel/closure/bsdf_principled_diffuse.h @@ -0,0 +1,127 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __BSDF_PRINCIPLED_DIFFUSE_H__ +#define __BSDF_PRINCIPLED_DIFFUSE_H__ + +/* DISNEY PRINCIPLED DIFFUSE BRDF + * + * Shading model by Brent Burley (Disney): "Physically Based Shading at Disney" (2012) + */ + +CCL_NAMESPACE_BEGIN + +typedef ccl_addr_space struct PrincipledDiffuseBsdf { + SHADER_CLOSURE_BASE; + + float roughness; +} PrincipledDiffuseBsdf; + +ccl_device float3 calculate_principled_diffuse_brdf(const PrincipledDiffuseBsdf *bsdf, + float3 N, float3 V, float3 L, float3 H, float *pdf) +{ + float NdotL = max(dot(N, L), 0.0f); + float NdotV = max(dot(N, V), 0.0f); + + if(NdotL < 0 || NdotV < 0) { + *pdf = 0.0f; + return make_float3(0.0f, 0.0f, 0.0f); + } + + float LdotH = dot(L, H); + + float FL = schlick_fresnel(NdotL), FV = schlick_fresnel(NdotV); + const float Fd90 = 0.5f + 2.0f * LdotH*LdotH * bsdf->roughness; + float Fd = (1.0f * (1.0f - FL) + Fd90 * FL) * (1.0f * (1.0f - FV) + Fd90 * FV); + + float value = M_1_PI_F * NdotL * Fd; + + return make_float3(value, value, value); +} + +ccl_device int bsdf_principled_diffuse_setup(PrincipledDiffuseBsdf *bsdf) +{ + bsdf->type = CLOSURE_BSDF_PRINCIPLED_DIFFUSE_ID; + return SD_BSDF|SD_BSDF_HAS_EVAL; +} + +ccl_device bool bsdf_principled_diffuse_merge(const ShaderClosure *a, const ShaderClosure *b) +{ + const PrincipledDiffuseBsdf *bsdf_a = (const PrincipledDiffuseBsdf*)a; + const PrincipledDiffuseBsdf *bsdf_b = (const PrincipledDiffuseBsdf*)b; + + return (isequal_float3(bsdf_a->N, bsdf_b->N) && bsdf_a->roughness == bsdf_b->roughness); +} + +ccl_device float3 bsdf_principled_diffuse_eval_reflect(const ShaderClosure *sc, const float3 I, + const float3 omega_in, float *pdf) +{ + const PrincipledDiffuseBsdf *bsdf = (const PrincipledDiffuseBsdf *)sc; + + float3 N = bsdf->N; + float3 V = I; // outgoing + float3 L = omega_in; // incoming + float3 H = normalize(L + V); + + if(dot(N, omega_in) > 0.0f) { + *pdf = fmaxf(dot(N, omega_in), 0.0f) * M_1_PI_F; + return calculate_principled_diffuse_brdf(bsdf, N, V, 
L, H, pdf); + } + else { + *pdf = 0.0f; + return make_float3(0.0f, 0.0f, 0.0f); + } +} + +ccl_device float3 bsdf_principled_diffuse_eval_transmit(const ShaderClosure *sc, const float3 I, + const float3 omega_in, float *pdf) +{ + return make_float3(0.0f, 0.0f, 0.0f); +} + +ccl_device int bsdf_principled_diffuse_sample(const ShaderClosure *sc, + float3 Ng, float3 I, float3 dIdx, float3 dIdy, float randu, float randv, + float3 *eval, float3 *omega_in, float3 *domega_in_dx, + float3 *domega_in_dy, float *pdf) +{ + const PrincipledDiffuseBsdf *bsdf = (const PrincipledDiffuseBsdf *)sc; + + float3 N = bsdf->N; + + sample_cos_hemisphere(N, randu, randv, omega_in, pdf); + + if(dot(Ng, *omega_in) > 0) { + float3 H = normalize(I + *omega_in); + + *eval = calculate_principled_diffuse_brdf(bsdf, N, I, *omega_in, H, pdf); + +#ifdef __RAY_DIFFERENTIALS__ + // TODO: find a better approximation for the diffuse bounce + *domega_in_dx = -((2 * dot(N, dIdx)) * N - dIdx); + *domega_in_dy = -((2 * dot(N, dIdy)) * N - dIdy); +#endif + } + else { + *pdf = 0.0f; + } + return LABEL_REFLECT|LABEL_DIFFUSE; +} + +CCL_NAMESPACE_END + +#endif /* __BSDF_PRINCIPLED_DIFFUSE_H__ */ + + diff --git a/intern/cycles/kernel/closure/bsdf_principled_sheen.h b/intern/cycles/kernel/closure/bsdf_principled_sheen.h new file mode 100644 index 00000000000..f4476bfecd0 --- /dev/null +++ b/intern/cycles/kernel/closure/bsdf_principled_sheen.h @@ -0,0 +1,113 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __BSDF_PRINCIPLED_SHEEN_H__ +#define __BSDF_PRINCIPLED_SHEEN_H__ + +/* DISNEY PRINCIPLED SHEEN BRDF + * + * Shading model by Brent Burley (Disney): "Physically Based Shading at Disney" (2012) + */ + +CCL_NAMESPACE_BEGIN + +typedef ccl_addr_space struct PrincipledSheenBsdf { + SHADER_CLOSURE_BASE; +} PrincipledSheenBsdf; + /* Sheen term shared by eval and sample: returns schlick_fresnel(dot(L, H)) * dot(N, L) in all three channels. When dot(N, L) or dot(N, V) is negative, *pdf is set to zero and black is returned; otherwise *pdf is left untouched. The bsdf argument is not read. */ +ccl_device float3 calculate_principled_sheen_brdf(const PrincipledSheenBsdf *bsdf, + float3 N, float3 V, float3 L, float3 H, float *pdf) +{ + float NdotL = dot(N, L); + float NdotV = dot(N, V); + + if(NdotL < 0 || NdotV < 0) { + *pdf = 0.0f; + return make_float3(0.0f, 0.0f, 0.0f); + } + + float LdotH = dot(L, H); + + float value = schlick_fresnel(LdotH) * NdotL; + + return make_float3(value, value, value); +} + /* Tag the closure as CLOSURE_BSDF_PRINCIPLED_SHEEN_ID and return the shader flags SD_BSDF|SD_BSDF_HAS_EVAL. */ +ccl_device int bsdf_principled_sheen_setup(PrincipledSheenBsdf *bsdf) +{ + bsdf->type = CLOSURE_BSDF_PRINCIPLED_SHEEN_ID; + return SD_BSDF|SD_BSDF_HAS_EVAL; +} + /* Evaluate the closure for a reflected direction. For dot(N, omega_in) > 0 the pdf is first set to dot(N, omega_in) * M_1_PI_F and the value comes from calculate_principled_sheen_brdf(), which may still reset the pdf to zero; otherwise pdf and value are both zero. */ +ccl_device float3 bsdf_principled_sheen_eval_reflect(const ShaderClosure *sc, const float3 I, + const float3 omega_in, float *pdf) +{ + const PrincipledSheenBsdf *bsdf = (const PrincipledSheenBsdf *)sc; + + float3 N = bsdf->N; + float3 V = I; // outgoing + float3 L = omega_in; // incoming + float3 H = normalize(L + V); + + if(dot(N, omega_in) > 0.0f) { + *pdf = fmaxf(dot(N, omega_in), 0.0f) * M_1_PI_F; + return calculate_principled_sheen_brdf(bsdf, N, V, L, H, pdf); + } + else { + *pdf = 0.0f; + return make_float3(0.0f, 0.0f, 0.0f); + } +} + /* Transmission evaluation: always returns black and never writes to the pdf output. */ +ccl_device float3 bsdf_principled_sheen_eval_transmit(const ShaderClosure *sc, const float3 I, + const float3 omega_in, float *pdf) +{ + return make_float3(0.0f, 0.0f, 0.0f); +} + /* Sample an incoming direction with sample_cos_hemisphere() around the shading normal bsdf->N. If the direction is on the positive side of Ng, the eval output is computed with calculate_principled_sheen_brdf(); otherwise the pdf is zeroed and the eval output is not written. Always returns LABEL_REFLECT|LABEL_DIFFUSE. */ +ccl_device int bsdf_principled_sheen_sample(const ShaderClosure *sc, + float3 Ng, float3 I, float3 dIdx, float3 dIdy, float randu, float randv, + float3 *eval, float3 *omega_in, float3 *domega_in_dx, + float3 *domega_in_dy, float *pdf) +{ +
const PrincipledSheenBsdf *bsdf = (const PrincipledSheenBsdf *)sc; + + float3 N = bsdf->N; + + sample_cos_hemisphere(N, randu, randv, omega_in, pdf); + + if(dot(Ng, *omega_in) > 0) { + float3 H = normalize(I + *omega_in); + + *eval = calculate_principled_sheen_brdf(bsdf, N, I, *omega_in, H, pdf); + +#ifdef __RAY_DIFFERENTIALS__ + // TODO: find a better approximation for the diffuse bounce + *domega_in_dx = -((2 * dot(N, dIdx)) * N - dIdx); + *domega_in_dy = -((2 * dot(N, dIdy)) * N - dIdy); +#endif + } + else { + *pdf = 0.0f; + } + return LABEL_REFLECT|LABEL_DIFFUSE; +} + +CCL_NAMESPACE_END + +#endif /* __BSDF_PRINCIPLED_SHEEN_H__ */ + + diff --git a/intern/cycles/kernel/closure/bsdf_toon.h b/intern/cycles/kernel/closure/bsdf_toon.h index 28e775bcbc8..d8b6d8ddead 100644 --- a/intern/cycles/kernel/closure/bsdf_toon.h +++ b/intern/cycles/kernel/closure/bsdf_toon.h @@ -38,7 +38,6 @@ CCL_NAMESPACE_BEGIN typedef ccl_addr_space struct ToonBsdf { SHADER_CLOSURE_BASE; - float3 N; float size; float smooth; } ToonBsdf; diff --git a/intern/cycles/kernel/closure/bsdf_util.h b/intern/cycles/kernel/closure/bsdf_util.h index b0c5280b6cb..3dc15d5791c 100644 --- a/intern/cycles/kernel/closure/bsdf_util.h +++ b/intern/cycles/kernel/closure/bsdf_util.h @@ -124,6 +124,13 @@ ccl_device float3 fresnel_conductor(float cosi, const float3 eta, const float3 k return(Rparl2 + Rperp2) * 0.5f; } +ccl_device float schlick_fresnel(float u) +{ /* Schlick weight: (1 - u) clamped to [0, 1], raised to the fifth power. */ + float m = clamp(1.0f - u, 0.0f, 1.0f); + float m2 = m * m; + return m2 * m2 * m; // pow(m, 5) +} + ccl_device float smooth_step(float edge0, float edge1, float x) { float result; @@ -136,6 +143,19 @@ ccl_device float smooth_step(float edge0, float edge1, float x) return result; } +/* Calculate the fresnel color which is a blend between white and the F0 color (cspec0). NOTE(review): F0_norm in the body is 1/(1 - F0), so F0 == 1 divides by zero; callers presumably keep F0 below 1 (TODO confirm). */ +ccl_device_forceinline float3 interpolate_fresnel_color(float3 L, float3 H, float ior, float F0, float3 cspec0) { + /* Calculate the fresnel interpolation factor + * The value from
fresnel_dielectric_cos(...) has to be normalized because + * the cspec0 keeps the F0 color + */ + float F0_norm = 1.0f / (1.0f - F0); + float FH = (fresnel_dielectric_cos(dot(L, H), ior) - F0) * F0_norm; + + /* Blend between white and a specular color with respect to the fresnel */ + return cspec0 * (1.0f - FH) + make_float3(1.0f, 1.0f, 1.0f) * FH; +} + CCL_NAMESPACE_END #endif /* __BSDF_UTIL_H__ */ diff --git a/intern/cycles/kernel/closure/bssrdf.h b/intern/cycles/kernel/closure/bssrdf.h index af0bbd861a9..f733ea4c517 100644 --- a/intern/cycles/kernel/closure/bssrdf.h +++ b/intern/cycles/kernel/closure/bssrdf.h @@ -27,7 +27,7 @@ typedef ccl_addr_space struct Bssrdf { float d; float texture_blur; float albedo; - float3 N; + float roughness; } Bssrdf; /* Planar Truncated Gaussian @@ -360,10 +360,32 @@ ccl_device int bssrdf_setup(Bssrdf *bssrdf, ClosureType type) { if(bssrdf->radius < BSSRDF_MIN_RADIUS) { /* revert to diffuse BSDF if radius too small */ - DiffuseBsdf *bsdf = (DiffuseBsdf*)bssrdf; - bsdf->N = bssrdf->N; - int flag = bsdf_diffuse_setup(bsdf); - bsdf->type = CLOSURE_BSDF_BSSRDF_ID; + int flag; +#ifdef __PRINCIPLED__ + if(type == CLOSURE_BSSRDF_PRINCIPLED_ID) { /* The fields are read into locals before the same memory is written through the PrincipledDiffuseBsdf pointer; presumably the two struct layouts overlap (TODO confirm). */ + float roughness = bssrdf->roughness; + float3 N = bssrdf->N; + float3 weight = bssrdf->weight; + float sample_weight = bssrdf->sample_weight; + + PrincipledDiffuseBsdf *bsdf = (PrincipledDiffuseBsdf*)bssrdf; + + bsdf->N = N; + bsdf->roughness = roughness; + bsdf->weight = weight; + bsdf->sample_weight = sample_weight; + flag = bsdf_principled_diffuse_setup(bsdf); + bsdf->type = CLOSURE_BSDF_BSSRDF_PRINCIPLED_ID; + } + else +#endif /* __PRINCIPLED__ */ + { + DiffuseBsdf *bsdf = (DiffuseBsdf*)bssrdf; + bsdf->N = bssrdf->N; + flag = bsdf_diffuse_setup(bsdf); + bsdf->type = CLOSURE_BSDF_BSSRDF_ID; + } + return flag; } else { @@ -371,7 +393,9 @@ ccl_device int bssrdf_setup(Bssrdf *bssrdf, ClosureType type) bssrdf->sharpness = saturate(bssrdf->sharpness); bssrdf->type = type; - if(type ==
CLOSURE_BSSRDF_BURLEY_ID) { + if(type == CLOSURE_BSSRDF_BURLEY_ID || + type == CLOSURE_BSSRDF_PRINCIPLED_ID) + { bssrdf_burley_setup(bssrdf); } @@ -385,7 +409,7 @@ ccl_device void bssrdf_sample(const ShaderClosure *sc, float xi, float *r, float bssrdf_cubic_sample(sc, xi, r, h); else if(sc->type == CLOSURE_BSSRDF_GAUSSIAN_ID) bssrdf_gaussian_sample(sc, xi, r, h); - else /*if(sc->type == CLOSURE_BSSRDF_BURLEY_ID)*/ + else /*if(sc->type == CLOSURE_BSSRDF_BURLEY_ID || sc->type == CLOSURE_BSSRDF_PRINCIPLED_ID)*/ bssrdf_burley_sample(sc, xi, r, h); } @@ -395,7 +419,7 @@ ccl_device_forceinline float bssrdf_pdf(const ShaderClosure *sc, float r) return bssrdf_cubic_pdf(sc, r); else if(sc->type == CLOSURE_BSSRDF_GAUSSIAN_ID) return bssrdf_gaussian_pdf(sc, r); - else /*if(sc->type == CLOSURE_BSSRDF_BURLEY_ID)*/ + else /*if(sc->type == CLOSURE_BSSRDF_BURLEY_ID || sc->type == CLOSURE_BSSRDF_PRINCIPLED_ID)*/ return bssrdf_burley_pdf(sc, r); } diff --git a/intern/cycles/kernel/filter/filter.h b/intern/cycles/kernel/filter/filter.h new file mode 100644 index 00000000000..f6e474d6702 --- /dev/null +++ b/intern/cycles/kernel/filter/filter.h @@ -0,0 +1,52 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ */ + +#ifndef __FILTER_H__ +#define __FILTER_H__ + +/* CPU Filter Kernel Interface: filter_cpu.h is included once per KERNEL_ARCH value below, and KERNEL_FUNCTION_FULL_NAME(name) expands to kernel_ARCH_name for the current KERNEL_ARCH. NOTE(review): KERNEL_ARCH is redefined before each include and never undefined in this header; presumably filter_cpu.h ends with an #undef (TODO confirm). */ + +#include "util/util_types.h" + +#include "kernel/filter/filter_defines.h" + +CCL_NAMESPACE_BEGIN + +#define KERNEL_NAME_JOIN(x, y, z) x ## _ ## y ## _ ## z +#define KERNEL_NAME_EVAL(arch, name) KERNEL_NAME_JOIN(kernel, arch, name) +#define KERNEL_FUNCTION_FULL_NAME(name) KERNEL_NAME_EVAL(KERNEL_ARCH, name) + +#define KERNEL_ARCH cpu +#include "kernel/kernels/cpu/filter_cpu.h" + +#define KERNEL_ARCH cpu_sse2 +#include "kernel/kernels/cpu/filter_cpu.h" + +#define KERNEL_ARCH cpu_sse3 +#include "kernel/kernels/cpu/filter_cpu.h" + +#define KERNEL_ARCH cpu_sse41 +#include "kernel/kernels/cpu/filter_cpu.h" + +#define KERNEL_ARCH cpu_avx +#include "kernel/kernels/cpu/filter_cpu.h" + +#define KERNEL_ARCH cpu_avx2 +#include "kernel/kernels/cpu/filter_cpu.h" + +CCL_NAMESPACE_END + +#endif /* __FILTER_H__ */ diff --git a/intern/cycles/kernel/filter/filter_defines.h b/intern/cycles/kernel/filter/filter_defines.h new file mode 100644 index 00000000000..ce96f733aff --- /dev/null +++ b/intern/cycles/kernel/filter/filter_defines.h @@ -0,0 +1,38 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ + +#ifndef __FILTER_DEFINES_H__ +#define __FILTER_DEFINES_H__ + /* DENOISE_FEATURES is the length of the per-pixel feature vector. XTWX_SIZE equals the number of entries in one triangle, diagonal included, of a square matrix with DENOISE_FEATURES+1 rows, so XtWX is presumably stored in packed triangular form (TODO confirm). */ +#define DENOISE_FEATURES 10 +#define TRANSFORM_SIZE (DENOISE_FEATURES*DENOISE_FEATURES) +#define XTWX_SIZE (((DENOISE_FEATURES+1)*(DENOISE_FEATURES+2))/2) +#define XTWY_SIZE (DENOISE_FEATURES+1) + /* Tile lookup table: offsets, strides and buffers hold one entry per tile, and x/y hold tile boundary coordinates. The prefilter kernels pick the entry as ytile*3 + xtile by comparing the pixel against x[1], x[2] and y[1], y[2]. */ +typedef struct TilesInfo { + int offsets[9]; + int strides[9]; + int x[4]; + int y[4]; + /* TODO(lukas): CUDA doesn't have uint64_t... */ +#ifdef __KERNEL_OPENCL__ + ccl_global float *buffers[9]; +#else + long long int buffers[9]; +#endif +} TilesInfo; + +#endif /* __FILTER_DEFINES_H__ */ diff --git a/intern/cycles/kernel/filter/filter_features.h b/intern/cycles/kernel/filter/filter_features.h new file mode 100644 index 00000000000..6226ed2c2ef --- /dev/null +++ b/intern/cycles/kernel/filter/filter_features.h @@ -0,0 +1,124 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + CCL_NAMESPACE_BEGIN + /* Read feature pass number pass for the pixel that buffer points to. Expands to a use of pass_stride, so a variable of that name must be in scope at the call site. */ +#define ccl_get_feature(buffer, pass) (buffer)[(pass)*pass_stride] + +/* Loop over the pixels in the range [low.x, high.x) x [low.y, high.y). + * pixel_buffer always points to the current pixel in the first pass.
*/ +#define FOR_PIXEL_WINDOW pixel_buffer = buffer + (low.y - rect.y)*buffer_w + (low.x - rect.x); \ + for(pixel.y = low.y; pixel.y < high.y; pixel.y++) { \ + for(pixel.x = low.x; pixel.x < high.x; pixel.x++, pixel_buffer++) { + +#define END_FOR_PIXEL_WINDOW } \ + pixel_buffer += buffer_w - (high.x - low.x); \ + } + /* Fill features[0..DENOISE_FEATURES-1] for one pixel: x, y, the absolute value of pass 0, then passes 1..7 unchanged. If mean is non-NULL it is subtracted element-wise. */ +ccl_device_inline void filter_get_features(int2 pixel, + const ccl_global float *ccl_restrict buffer, + float *features, + const float *ccl_restrict mean, + int pass_stride) +{ + features[0] = pixel.x; + features[1] = pixel.y; + features[2] = fabsf(ccl_get_feature(buffer, 0)); + features[3] = ccl_get_feature(buffer, 1); + features[4] = ccl_get_feature(buffer, 2); + features[5] = ccl_get_feature(buffer, 3); + features[6] = ccl_get_feature(buffer, 4); + features[7] = ccl_get_feature(buffer, 5); + features[8] = ccl_get_feature(buffer, 6); + features[9] = ccl_get_feature(buffer, 7); + if(mean) { + for(int i = 0; i < DENOISE_FEATURES; i++) + features[i] -= mean[i]; + } +} + /* Per-pixel deviation from mean, used to derive the feature scales. Writes six values: absolute differences for x, y, abs(pass 0) and pass 4 into scales[0], [1], [2] and [4], and squared lengths of the 3-vectors built from passes 1..3 and 5..7 into scales[3] and scales[5]. mean is dereferenced unconditionally. */ +ccl_device_inline void filter_get_feature_scales(int2 pixel, + const ccl_global float *ccl_restrict buffer, + float *scales, + const float *ccl_restrict mean, + int pass_stride) +{ + scales[0] = fabsf(pixel.x - mean[0]); + scales[1] = fabsf(pixel.y - mean[1]); + scales[2] = fabsf(fabsf(ccl_get_feature(buffer, 0)) - mean[2]); + scales[3] = len_squared(make_float3(ccl_get_feature(buffer, 1) - mean[3], + ccl_get_feature(buffer, 2) - mean[4], + ccl_get_feature(buffer, 3) - mean[5])); + scales[4] = fabsf(ccl_get_feature(buffer, 4) - mean[6]); + scales[5] = len_squared(make_float3(ccl_get_feature(buffer, 5) - mean[7], + ccl_get_feature(buffer, 6) - mean[8], + ccl_get_feature(buffer, 7) - mean[9])); +} + /* Turn the six-slot layout written by filter_get_feature_scales() into ten per-feature scale factors, in place. Each factor is the reciprocal of max(v, 0.01), with a square root applied first to the two squared-length entries. The assignment order matters: scale[6] and scale[7..9] are derived from scale[4] and scale[5] before those slots are overwritten by the last statement. */ +ccl_device_inline void filter_calculate_scale(float *scale) +{ + scale[0] = 1.0f/max(scale[0], 0.01f); + scale[1] = 1.0f/max(scale[1], 0.01f); + scale[2] = 1.0f/max(scale[2], 0.01f); + scale[6] = 1.0f/max(scale[4], 0.01f); + scale[7] = scale[8] = scale[9] = 1.0f/max(sqrtf(scale[5]), 0.01f); +
scale[3] = scale[4] = scale[5] = 1.0f/max(sqrtf(scale[3]), 0.01f); +} + /* Return passes 8, 9 and 10 of the pixel as a float3. */ +ccl_device_inline float3 filter_get_color(const ccl_global float *ccl_restrict buffer, + int pass_stride) +{ + return make_float3(ccl_get_feature(buffer, 8), ccl_get_feature(buffer, 9), ccl_get_feature(buffer, 10)); +} + /* Add feature times the transform entry at index (row*DENOISE_FEATURES + i)*stride to design_row[1+i], for every i in [0, rank). */ +ccl_device_inline void design_row_add(float *design_row, + int rank, + const ccl_global float *ccl_restrict transform, + int stride, + int row, + float feature) +{ + for(int i = 0; i < rank; i++) { + design_row[1+i] += transform[(row*DENOISE_FEATURES + i)*stride]*feature; + } +} + +/* Fill the design row: design_row[0] is set to 1, and design_row[1..rank] receive the feature differences between pixel q and pixel p (the same ten features as filter_get_features) projected through transform by design_row_add(). */ +ccl_device_inline void filter_get_design_row_transform(int2 p_pixel, + const ccl_global float *ccl_restrict p_buffer, + int2 q_pixel, + const ccl_global float *ccl_restrict q_buffer, + int pass_stride, + int rank, + float *design_row, + const ccl_global float *ccl_restrict transform, + int stride) +{ + design_row[0] = 1.0f; + math_vector_zero(design_row+1, rank); + design_row_add(design_row, rank, transform, stride, 0, q_pixel.x - p_pixel.x); + design_row_add(design_row, rank, transform, stride, 1, q_pixel.y - p_pixel.y); + design_row_add(design_row, rank, transform, stride, 2, fabsf(ccl_get_feature(q_buffer, 0)) - fabsf(ccl_get_feature(p_buffer, 0))); + design_row_add(design_row, rank, transform, stride, 3, ccl_get_feature(q_buffer, 1) - ccl_get_feature(p_buffer, 1)); + design_row_add(design_row, rank, transform, stride, 4, ccl_get_feature(q_buffer, 2) - ccl_get_feature(p_buffer, 2)); + design_row_add(design_row, rank, transform, stride, 5, ccl_get_feature(q_buffer, 3) - ccl_get_feature(p_buffer, 3)); + design_row_add(design_row, rank, transform, stride, 6, ccl_get_feature(q_buffer, 4) - ccl_get_feature(p_buffer, 4)); + design_row_add(design_row, rank, transform, stride, 7, ccl_get_feature(q_buffer, 5) - ccl_get_feature(p_buffer, 5)); + design_row_add(design_row, rank, transform, stride, 8, ccl_get_feature(q_buffer, 6) - ccl_get_feature(p_buffer, 6)); +
design_row_add(design_row, rank, transform, stride, 9, ccl_get_feature(q_buffer, 7) - ccl_get_feature(p_buffer, 7)); +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/filter/filter_features_sse.h b/intern/cycles/kernel/filter/filter_features_sse.h new file mode 100644 index 00000000000..3185330994c --- /dev/null +++ b/intern/cycles/kernel/filter/filter_features_sse.h @@ -0,0 +1,105 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +CCL_NAMESPACE_BEGIN + /* Unaligned load of four consecutive floats of pass number pass. Uses buffer and pass_stride from the enclosing scope. */ +#define ccl_get_feature_sse(pass) _mm_loadu_ps(buffer + (pass)*pass_stride) + +/* Loop over the pixels in the range [low.x, high.x) x [low.y, high.y), 4 at a time. + * pixel_buffer always points to the first of the 4 current pixels in the first pass. + * x4 and y4 contain the coordinates of the four pixels, active_pixels contains a mask that's set for all pixels within the window.
*/ + +#define FOR_PIXEL_WINDOW_SSE pixel_buffer = buffer + (low.y - rect.y)*buffer_w + (low.x - rect.x); \ + for(pixel.y = low.y; pixel.y < high.y; pixel.y++) { \ + __m128 y4 = _mm_set1_ps(pixel.y); \ + for(pixel.x = low.x; pixel.x < high.x; pixel.x += 4, pixel_buffer += 4) { \ + __m128 x4 = _mm_add_ps(_mm_set1_ps(pixel.x), _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); \ + __m128 active_pixels = _mm_cmplt_ps(x4, _mm_set1_ps(high.x)); + +#define END_FOR_PIXEL_WINDOW_SSE } \ + pixel_buffer += buffer_w - (pixel.x - low.x); \ + } + /* Four-pixel version of filter_get_features(): each features[i] holds feature i for four pixels. After the optional mean subtraction every feature is passed through _mm_mask_ps() with active_pixels. */ +ccl_device_inline void filter_get_features_sse(__m128 x, __m128 y, + __m128 active_pixels, + const float *ccl_restrict buffer, + __m128 *features, + const __m128 *ccl_restrict mean, + int pass_stride) +{ + features[0] = x; + features[1] = y; + features[2] = _mm_fabs_ps(ccl_get_feature_sse(0)); + features[3] = ccl_get_feature_sse(1); + features[4] = ccl_get_feature_sse(2); + features[5] = ccl_get_feature_sse(3); + features[6] = ccl_get_feature_sse(4); + features[7] = ccl_get_feature_sse(5); + features[8] = ccl_get_feature_sse(6); + features[9] = ccl_get_feature_sse(7); + if(mean) { + for(int i = 0; i < DENOISE_FEATURES; i++) + features[i] = _mm_sub_ps(features[i], mean[i]); + } + for(int i = 0; i < DENOISE_FEATURES; i++) + features[i] = _mm_mask_ps(features[i], active_pixels); +} + /* Four-pixel version of filter_get_feature_scales(): writes the same six quantities per lane into scales[0..5] and masks each result with active_pixels. mean is dereferenced unconditionally. */ +ccl_device_inline void filter_get_feature_scales_sse(__m128 x, __m128 y, + __m128 active_pixels, + const float *ccl_restrict buffer, + __m128 *scales, + const __m128 *ccl_restrict mean, + int pass_stride) +{ + scales[0] = _mm_mask_ps(_mm_fabs_ps(_mm_sub_ps(x, mean[0])), active_pixels); + scales[1] = _mm_mask_ps(_mm_fabs_ps(_mm_sub_ps(y, mean[1])), active_pixels); + + scales[2] = _mm_mask_ps(_mm_fabs_ps(_mm_sub_ps(_mm_fabs_ps(ccl_get_feature_sse(0)), mean[2])), active_pixels); + + __m128 diff, scale; + diff = _mm_sub_ps(ccl_get_feature_sse(1), mean[3]); + scale = _mm_mul_ps(diff, diff); + diff = _mm_sub_ps(ccl_get_feature_sse(2), mean[4]); + scale = _mm_add_ps(scale,
_mm_mul_ps(diff, diff)); + diff = _mm_sub_ps(ccl_get_feature_sse(3), mean[5]); + scale = _mm_add_ps(scale, _mm_mul_ps(diff, diff)); + scales[3] = _mm_mask_ps(scale, active_pixels); + + scales[4] = _mm_mask_ps(_mm_fabs_ps(_mm_sub_ps(ccl_get_feature_sse(4), mean[6])), active_pixels); + + diff = _mm_sub_ps(ccl_get_feature_sse(5), mean[7]); + scale = _mm_mul_ps(diff, diff); + diff = _mm_sub_ps(ccl_get_feature_sse(6), mean[8]); + scale = _mm_add_ps(scale, _mm_mul_ps(diff, diff)); + diff = _mm_sub_ps(ccl_get_feature_sse(7), mean[9]); + scale = _mm_add_ps(scale, _mm_mul_ps(diff, diff)); + scales[5] = _mm_mask_ps(scale, active_pixels); +} + /* SSE version of filter_calculate_scale(): same slot layout and the same order dependency (scale[4] and scale[5] are read before being overwritten), built from _mm_hmax_ps, a max against 0.01 and _mm_rcp_ps. NOTE(review): _mm_rcp_ps is an approximate reciprocal, so results can differ slightly from the scalar division path. */ +ccl_device_inline void filter_calculate_scale_sse(__m128 *scale) +{ + scale[0] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(scale[0]), _mm_set1_ps(0.01f))); + scale[1] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(scale[1]), _mm_set1_ps(0.01f))); + scale[2] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(scale[2]), _mm_set1_ps(0.01f))); + scale[6] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(scale[4]), _mm_set1_ps(0.01f))); + + scale[7] = scale[8] = scale[9] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(_mm_sqrt_ps(scale[5])), _mm_set1_ps(0.01f))); + scale[3] = scale[4] = scale[5] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(_mm_sqrt_ps(scale[3])), _mm_set1_ps(0.01f))); +} + + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/filter/filter_kernel.h b/intern/cycles/kernel/filter/filter_kernel.h new file mode 100644 index 00000000000..2ef03dc0a02 --- /dev/null +++ b/intern/cycles/kernel/filter/filter_kernel.h @@ -0,0 +1,50 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "util/util_color.h" +#include "util/util_math.h" +#include "util/util_math_fast.h" +#include "util/util_texture.h" + +#include "util/util_atomic.h" +#include "util/util_math_matrix.h" + +#include "kernel/filter/filter_defines.h" + +#include "kernel/filter/filter_features.h" +#ifdef __KERNEL_SSE3__ +# include "kernel/filter/filter_features_sse.h" +#endif + +#include "kernel/filter/filter_prefilter.h" + +#ifdef __KERNEL_GPU__ +# include "kernel/filter/filter_transform_gpu.h" +#else +# ifdef __KERNEL_SSE3__ +# include "kernel/filter/filter_transform_sse.h" +# else +# include "kernel/filter/filter_transform.h" +# endif +#endif + +#include "kernel/filter/filter_reconstruction.h" + +#ifdef __KERNEL_CPU__ +# include "kernel/filter/filter_nlm_cpu.h" +#else +# include "kernel/filter/filter_nlm_gpu.h" +#endif diff --git a/intern/cycles/kernel/filter/filter_nlm_cpu.h b/intern/cycles/kernel/filter/filter_nlm_cpu.h new file mode 100644 index 00000000000..3e752bce68f --- /dev/null +++ b/intern/cycles/kernel/filter/filter_nlm_cpu.h @@ -0,0 +1,186 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +CCL_NAMESPACE_BEGIN + /* For every pixel of rect (x in [rect.x, rect.z), y in [rect.y, rect.w)), store the variance-normalized squared difference between the pixel and the pixel shifted by (dx, dy). One channel is used when channel_offset is 0, otherwise three channels spaced channel_offset apart are averaged. The shifted reads are not bounds-checked here. */ +ccl_device_inline void kernel_filter_nlm_calc_difference(int dx, int dy, + const float *ccl_restrict weight_image, + const float *ccl_restrict variance_image, + float *difference_image, + int4 rect, + int w, + int channel_offset, + float a, + float k_2) +{ + for(int y = rect.y; y < rect.w; y++) { + for(int x = rect.x; x < rect.z; x++) { + float diff = 0.0f; + int numChannels = channel_offset? 3 : 1; + for(int c = 0; c < numChannels; c++) { + float cdiff = weight_image[c*channel_offset + y*w+x] - weight_image[c*channel_offset + (y+dy)*w+(x+dx)]; + float pvar = variance_image[c*channel_offset + y*w+x]; + float qvar = variance_image[c*channel_offset + (y+dy)*w+(x+dx)]; + diff += (cdiff*cdiff - a*(pvar + min(pvar, qvar))) / (1e-8f + k_2*(pvar+qvar)); + } + if(numChannels > 1) { + diff *= 1.0f/numChannels; + } + difference_image[y*w+x] = diff; + } + } +} + /* Average difference_image over the vertical window [y-f, y+f], clamped to rect, into out_image. NOTE(review): the __KERNEL_SSE3__ path walks x from rect.x rounded down to rect.z rounded up to multiples of 4 and uses the aligned _mm_load_ps and _mm_store_ps, so it touches up to 3 floats on either side of [rect.x, rect.z) that the zeroing and scaling loops do not cover, and it needs the addressed floats to be 16-byte aligned; presumably the caller pads and aligns the buffers (TODO confirm). */ +ccl_device_inline void kernel_filter_nlm_blur(const float *ccl_restrict difference_image, + float *out_image, + int4 rect, + int w, + int f) +{ +#ifdef __KERNEL_SSE3__ + int aligned_lowx = (rect.x & ~(3)); + int aligned_highx = ((rect.z + 3) & ~(3)); +#endif + for(int y = rect.y; y < rect.w; y++) { + const int low = max(rect.y, y-f); + const int high = min(rect.w, y+f+1); + for(int x = rect.x; x < rect.z; x++) { + out_image[y*w+x] = 0.0f; + } + for(int y1 = low; y1 < high; y1++) { +#ifdef __KERNEL_SSE3__ + for(int x = aligned_lowx; x < aligned_highx; x+=4) { + _mm_store_ps(out_image + y*w+x, _mm_add_ps(_mm_load_ps(out_image + y*w+x), _mm_load_ps(difference_image + y1*w+x))); + } +#else + for(int x = rect.x; x < rect.z; x++) { + out_image[y*w+x] += difference_image[y1*w+x]; + } +#endif + } + for(int x = rect.x; x < rect.z; x++) { + out_image[y*w+x] *= 1.0f/(high - low); + } + } +} + /* Sum difference_image over the horizontal window [x-f, x+f], clamped to rect, divide by the clamped window width and store fast_expf(-max(average, 0)) in out_image. */ +ccl_device_inline void kernel_filter_nlm_calc_weight(const float *ccl_restrict difference_image, +
float *out_image, + int4 rect, + int w, + int f) +{ + for(int y = rect.y; y < rect.w; y++) { + for(int x = rect.x; x < rect.z; x++) { + out_image[y*w+x] = 0.0f; + } + } + for(int dx = -f; dx <= f; dx++) { + int pos_dx = max(0, dx); + int neg_dx = min(0, dx); + for(int y = rect.y; y < rect.w; y++) { + for(int x = rect.x-neg_dx; x < rect.z-pos_dx; x++) { + out_image[y*w+x] += difference_image[y*w+dx+x]; + } + } + } + for(int y = rect.y; y < rect.w; y++) { + for(int x = rect.x; x < rect.z; x++) { + const int low = max(rect.x, x-f); + const int high = min(rect.z, x+f+1); + out_image[y*w+x] = fast_expf(-max(out_image[y*w+x] * (1.0f/(high - low)), 0.0f)); + } + } +} + /* For every pixel of rect, average difference_image over the clamped horizontal window [x-f, x+f] to get a weight, add the weight to accum_image and add the weight times the (dx, dy)-shifted image value to out_image. */ +ccl_device_inline void kernel_filter_nlm_update_output(int dx, int dy, + const float *ccl_restrict difference_image, + const float *ccl_restrict image, + float *out_image, + float *accum_image, + int4 rect, + int w, + int f) +{ + for(int y = rect.y; y < rect.w; y++) { + for(int x = rect.x; x < rect.z; x++) { + const int low = max(rect.x, x-f); + const int high = min(rect.z, x+f+1); + float sum = 0.0f; + for(int x1 = low; x1 < high; x1++) { + sum += difference_image[y*w+x1]; + } + float weight = sum * (1.0f/(high - low)); + accum_image[y*w+x] += weight; + out_image[y*w+x] += weight*image[(y+dy)*w+(x+dx)]; + } + } +} + /* For every pixel in the overlap of rect and the filter window, compute the horizontally averaged weight and forward it to kernel_filter_construct_gramian() together with the slice of transform, rank, XtWX and XtWY that belongs to that pixel. */ +ccl_device_inline void kernel_filter_nlm_construct_gramian(int dx, int dy, + const float *ccl_restrict difference_image, + const float *ccl_restrict buffer, + float *transform, + int *rank, + float *XtWX, + float3 *XtWY, + int4 rect, + int4 filter_rect, + int w, int h, int f, + int pass_stride) +{ + /* fx and fy are in filter-window-relative coordinates, while x and y are in feature-window-relative coordinates.
*/ + for(int fy = max(0, rect.y-filter_rect.y); fy < min(filter_rect.w, rect.w-filter_rect.y); fy++) { + int y = fy + filter_rect.y; + for(int fx = max(0, rect.x-filter_rect.x); fx < min(filter_rect.z, rect.z-filter_rect.x); fx++) { + int x = fx + filter_rect.x; + const int low = max(rect.x, x-f); + const int high = min(rect.z, x+f+1); + float sum = 0.0f; + for(int x1 = low; x1 < high; x1++) { + sum += difference_image[y*w+x1]; + } + float weight = sum * (1.0f/(high - low)); + + int storage_ofs = fy*filter_rect.z + fx; + float *l_transform = transform + storage_ofs*TRANSFORM_SIZE; + float *l_XtWX = XtWX + storage_ofs*XTWX_SIZE; + float3 *l_XtWY = XtWY + storage_ofs*XTWY_SIZE; + int *l_rank = rank + storage_ofs; + + kernel_filter_construct_gramian(x, y, 1, + dx, dy, w, h, + pass_stride, + buffer, + l_transform, l_rank, + weight, l_XtWX, l_XtWY, 0); + } + } +} + /* Divide out_image by accum_image for every pixel of rect. NOTE(review): there is no guard against a zero accum_image entry. */ +ccl_device_inline void kernel_filter_nlm_normalize(float *out_image, + const float *ccl_restrict accum_image, + int4 rect, + int w) +{ + for(int y = rect.y; y < rect.w; y++) { + for(int x = rect.x; x < rect.z; x++) { + out_image[y*w+x] /= accum_image[y*w+x]; + } + } +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/filter/filter_nlm_gpu.h b/intern/cycles/kernel/filter/filter_nlm_gpu.h new file mode 100644 index 00000000000..2c5ac807051 --- /dev/null +++ b/intern/cycles/kernel/filter/filter_nlm_gpu.h @@ -0,0 +1,144 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +CCL_NAMESPACE_BEGIN + /* Store in difference_image the variance-normalized squared difference between pixel (x, y) and the pixel shifted by (dx, dy). One channel is used when channel_offset is 0, otherwise three channels spaced channel_offset apart are averaged. rect is not read and no bounds check is made here. */ +ccl_device_inline void kernel_filter_nlm_calc_difference(int x, int y, + int dx, int dy, + const ccl_global float *ccl_restrict weight_image, + const ccl_global float *ccl_restrict variance_image, + ccl_global float *difference_image, + int4 rect, int w, + int channel_offset, + float a, float k_2) +{ + float diff = 0.0f; + int numChannels = channel_offset? 3 : 1; + for(int c = 0; c < numChannels; c++) { + float cdiff = weight_image[c*channel_offset + y*w+x] - weight_image[c*channel_offset + (y+dy)*w+(x+dx)]; + float pvar = variance_image[c*channel_offset + y*w+x]; + float qvar = variance_image[c*channel_offset + (y+dy)*w+(x+dx)]; + diff += (cdiff*cdiff - a*(pvar + min(pvar, qvar))) / (1e-8f + k_2*(pvar+qvar)); + } + if(numChannels > 1) { + diff *= 1.0f/numChannels; + } + difference_image[y*w+x] = diff; +} + /* Average difference_image over rows [y-f, y+f], clamped to rect, for column x and store the result at (x, y) of out_image. */ +ccl_device_inline void kernel_filter_nlm_blur(int x, int y, + const ccl_global float *ccl_restrict difference_image, + ccl_global float *out_image, + int4 rect, int w, int f) +{ + float sum = 0.0f; + const int low = max(rect.y, y-f); + const int high = min(rect.w, y+f+1); + for(int y1 = low; y1 < high; y1++) { + sum += difference_image[y1*w+x]; + } + sum *= 1.0f/(high-low); + out_image[y*w+x] = sum; +} + /* Average difference_image over columns [x-f, x+f], clamped to rect, and store fast_expf(-max(average, 0)) at (x, y) of out_image. */ +ccl_device_inline void kernel_filter_nlm_calc_weight(int x, int y, + const ccl_global float *ccl_restrict difference_image, + ccl_global float *out_image, + int4 rect, int w, int f) +{ + float sum = 0.0f; + const int low = max(rect.x, x-f); + const int high = min(rect.z, x+f+1); + for(int x1 = low; x1 < high; x1++) { + sum += difference_image[y*w+x1]; + } + sum *= 1.0f/(high-low); + out_image[y*w+x] = fast_expf(-max(sum, 0.0f)); +} + /* Average difference_image over the clamped horizontal window to get the weight for pixel (x, y). With a non-NULL out_image the weight is accumulated into accum_image and the weighted (dx, dy)-shifted image value into out_image; with a NULL out_image the weight overwrites the accum_image entry instead. */ +ccl_device_inline void kernel_filter_nlm_update_output(int x, int y, + int dx, int dy, + const ccl_global float *ccl_restrict difference_image, + const ccl_global float *ccl_restrict image, +
ccl_global float *out_image, + ccl_global float *accum_image, + int4 rect, int w, int f) +{ + float sum = 0.0f; + const int low = max(rect.x, x-f); + const int high = min(rect.z, x+f+1); + for(int x1 = low; x1 < high; x1++) { + sum += difference_image[y*w+x1]; + } + sum *= 1.0f/(high-low); + if(out_image) { + accum_image[y*w+x] += sum; + out_image[y*w+x] += sum*image[(y+dy)*w+(x+dx)]; + } + else { + accum_image[y*w+x] = sum; + } +} + /* Compute the horizontally averaged weight for filter-window pixel (fx, fy) and call kernel_filter_construct_gramian(). All storage pointers are advanced by storage_ofs elements and filter_rect.z*filter_rect.w is passed as the third argument; presumably that is the stride between consecutive entries belonging to one pixel (TODO confirm). */ +ccl_device_inline void kernel_filter_nlm_construct_gramian(int fx, int fy, + int dx, int dy, + const ccl_global float *ccl_restrict difference_image, + const ccl_global float *ccl_restrict buffer, + const ccl_global float *ccl_restrict transform, + ccl_global int *rank, + ccl_global float *XtWX, + ccl_global float3 *XtWY, + int4 rect, + int4 filter_rect, + int w, int h, int f, + int pass_stride, + int localIdx) +{ + int y = fy + filter_rect.y; + int x = fx + filter_rect.x; + const int low = max(rect.x, x-f); + const int high = min(rect.z, x+f+1); + float sum = 0.0f; + for(int x1 = low; x1 < high; x1++) { + sum += difference_image[y*w+x1]; + } + float weight = sum * (1.0f/(high - low)); + + int storage_ofs = fy*filter_rect.z + fx; + transform += storage_ofs; + rank += storage_ofs; + XtWX += storage_ofs; + XtWY += storage_ofs; + + kernel_filter_construct_gramian(x, y, + filter_rect.z*filter_rect.w, + dx, dy, w, h, + pass_stride, + buffer, + transform, rank, + weight, XtWX, XtWY, + localIdx); +} + /* Divide the out_image entry at (x, y) by the matching accum_image entry. NOTE(review): there is no guard against a zero accum_image entry. */ +ccl_device_inline void kernel_filter_nlm_normalize(int x, int y, + ccl_global float *out_image, + const ccl_global float *ccl_restrict accum_image, + int4 rect, int w) +{ + out_image[y*w+x] /= accum_image[y*w+x]; +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/filter/filter_prefilter.h b/intern/cycles/kernel/filter/filter_prefilter.h new file mode 100644 index 00000000000..a0b89c1111f --- /dev/null +++ b/intern/cycles/kernel/filter/filter_prefilter.h @@ -0,0 +1,211 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed
under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +CCL_NAMESPACE_BEGIN + +/* First step of the shadow prefiltering, performs the shadow division and stores all data + * in a nice and easy rectangular array that can be passed to the NLM filter. + * + * Calculates: + * unfiltered: Contains the two half images of the shadow feature pass + * sampleVariance: The sample-based variance calculated in the kernel. Note: This calculation is biased in general, and especially here since the variance of the ratio can only be approximated. + * sampleVarianceV: Variance of the sample variance estimation, quite noisy (since it's essentially the buffer variance of the two variance halves) + * bufferVariance: The buffer-based variance of the shadow feature. Unbiased, but quite noisy. + */ +ccl_device void kernel_filter_divide_shadow(int sample, + ccl_global TilesInfo *tiles, + int x, int y, + ccl_global float *unfilteredA, + ccl_global float *unfilteredB, + ccl_global float *sampleVariance, + ccl_global float *sampleVarianceV, + ccl_global float *bufferVariance, + int4 rect, + int buffer_pass_stride, + int buffer_denoising_offset, + bool use_split_variance) +{ + int xtile = (x < tiles->x[1])? 0: ((x < tiles->x[2])? 1: 2); + int ytile = (y < tiles->y[1])? 0: ((y < tiles->y[2])? 
1: 2); + int tile = ytile*3+xtile; + + int offset = tiles->offsets[tile]; + int stride = tiles->strides[tile]; + const ccl_global float *ccl_restrict center_buffer = (ccl_global float*) tiles->buffers[tile]; + center_buffer += (y*stride + x + offset)*buffer_pass_stride; + center_buffer += buffer_denoising_offset + 14; + + int buffer_w = align_up(rect.z - rect.x, 4); + int idx = (y-rect.y)*buffer_w + (x - rect.x); + unfilteredA[idx] = center_buffer[1] / max(center_buffer[0], 1e-7f); + unfilteredB[idx] = center_buffer[4] / max(center_buffer[3], 1e-7f); + + float varA = center_buffer[2]; + float varB = center_buffer[5]; + int odd_sample = (sample+1)/2; + int even_sample = sample/2; + if(use_split_variance) { + varA = max(0.0f, varA - unfilteredA[idx]*unfilteredA[idx]*odd_sample); + varB = max(0.0f, varB - unfilteredB[idx]*unfilteredB[idx]*even_sample); + } + varA /= max(odd_sample - 1, 1); + varB /= max(even_sample - 1, 1); + + sampleVariance[idx] = 0.5f*(varA + varB) / sample; + sampleVarianceV[idx] = 0.5f * (varA - varB) * (varA - varB) / (sample*sample); + bufferVariance[idx] = 0.5f * (unfilteredA[idx] - unfilteredB[idx]) * (unfilteredA[idx] - unfilteredB[idx]); +} + +/* Load a regular feature from the render buffers into the denoise buffer. + * Parameters: + * - sample: The sample amount in the buffer, used to normalize the buffer. + * - m_offset, v_offset: Render Buffer Pass offsets of mean and variance of the feature. + * - x, y: Current pixel + * - mean, variance: Target denoise buffers. + * - rect: The prefilter area (lower pixels inclusive, upper pixels exclusive). + */ +ccl_device void kernel_filter_get_feature(int sample, + ccl_global TilesInfo *tiles, + int m_offset, int v_offset, + int x, int y, + ccl_global float *mean, + ccl_global float *variance, + int4 rect, int buffer_pass_stride, + int buffer_denoising_offset, + bool use_split_variance) +{ + int xtile = (x < tiles->x[1])? 0: ((x < tiles->x[2])? 1: 2); + int ytile = (y < tiles->y[1])? 
0: ((y < tiles->y[2])? 1: 2); + int tile = ytile*3+xtile; + ccl_global float *center_buffer = ((ccl_global float*) tiles->buffers[tile]) + (tiles->offsets[tile] + y*tiles->strides[tile] + x)*buffer_pass_stride + buffer_denoising_offset; + + int buffer_w = align_up(rect.z - rect.x, 4); + int idx = (y-rect.y)*buffer_w + (x - rect.x); + + mean[idx] = center_buffer[m_offset] / sample; + if (sample > 1) { + if(use_split_variance) { + variance[idx] = max(0.0f, (center_buffer[v_offset] - mean[idx]*mean[idx]*sample) / (sample * (sample-1))); + } + else { + variance[idx] = center_buffer[v_offset] / (sample * (sample-1)); + } + } + else { + /* Can't compute variance with single sample, just set it very high. */ + variance[idx] = 1e10f; + } +} + +ccl_device void kernel_filter_detect_outliers(int x, int y, + ccl_global float *image, + ccl_global float *variance, + ccl_global float *depth, + ccl_global float *out, + int4 rect, + int pass_stride) +{ + int buffer_w = align_up(rect.z - rect.x, 4); + + int n = 0; + float values[25]; + for(int y1 = max(y-2, rect.y); y1 < min(y+3, rect.w); y1++) { + for(int x1 = max(x-2, rect.x); x1 < min(x+3, rect.z); x1++) { + int idx = (y1-rect.y)*buffer_w + (x1-rect.x); + float L = average(make_float3(image[idx], image[idx+pass_stride], image[idx+2*pass_stride])); + + /* Find the position of L. */ + int i; + for(i = 0; i < n; i++) { + if(values[i] > L) break; + } + /* Make space for L by shifting all following values to the right. */ + for(int j = n; j > i; j--) { + values[j] = values[j-1]; + } + /* Insert L. */ + values[i] = L; + n++; + } + } + + int idx = (y-rect.y)*buffer_w + (x-rect.x); + float L = average(make_float3(image[idx], image[idx+pass_stride], image[idx+2*pass_stride])); + + float ref = 2.0f*values[(int)(n*0.75f)]; + float fac = 1.0f; + if(L > ref) { + /* The pixel appears to be an outlier. + * However, it may just be a legitimate highlight. 
Therefore, it is checked how likely it is that the pixel + * should actually be at the reference value: + * If the reference is within the 3-sigma interval, the pixel is assumed to be a statistical outlier. + * Otherwise, it is very unlikely that the pixel should be darker, which indicates a legitimate highlight. + */ + float stddev = sqrtf(average(make_float3(variance[idx], variance[idx+pass_stride], variance[idx+2*pass_stride]))); + if(L - 3*stddev < ref) { + /* The pixel is an outlier, so negate the depth value to mark it as one. + * Also, scale its brightness down to the outlier threshold to avoid trouble with the NLM weights. */ + depth[idx] = -depth[idx]; + fac = ref/L; + variance[idx ] *= fac*fac; + variance[idx + pass_stride] *= fac*fac; + variance[idx+2*pass_stride] *= fac*fac; + } + } + out[idx ] = fac*image[idx]; + out[idx + pass_stride] = fac*image[idx + pass_stride]; + out[idx+2*pass_stride] = fac*image[idx+2*pass_stride]; +} + +/* Combine A/B buffers. + * Calculates the combined mean and the buffer variance. */ +ccl_device void kernel_filter_combine_halves(int x, int y, + ccl_global float *mean, + ccl_global float *variance, + ccl_global float *a, + ccl_global float *b, + int4 rect, int r) +{ + int buffer_w = align_up(rect.z - rect.x, 4); + int idx = (y-rect.y)*buffer_w + (x - rect.x); + + if(mean) mean[idx] = 0.5f * (a[idx]+b[idx]); + if(variance) { + if(r == 0) variance[idx] = 0.25f * (a[idx]-b[idx])*(a[idx]-b[idx]); + else { + variance[idx] = 0.0f; + float values[25]; + int numValues = 0; + for(int py = max(y-r, rect.y); py < min(y+r+1, rect.w); py++) { + for(int px = max(x-r, rect.x); px < min(x+r+1, rect.z); px++) { + int pidx = (py-rect.y)*buffer_w + (px-rect.x); + values[numValues++] = 0.25f * (a[pidx]-b[pidx])*(a[pidx]-b[pidx]); + } + } + /* Insertion-sort the variances (fast enough for 25 elements). 
*/ + for(int i = 1; i < numValues; i++) { + float v = values[i]; + int j; + for(j = i-1; j >= 0 && values[j] > v; j--) + values[j+1] = values[j]; + values[j+1] = v; + } + variance[idx] = values[(7*numValues)/8]; + } + } +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/filter/filter_reconstruction.h b/intern/cycles/kernel/filter/filter_reconstruction.h new file mode 100644 index 00000000000..25a3025056c --- /dev/null +++ b/intern/cycles/kernel/filter/filter_reconstruction.h @@ -0,0 +1,117 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +CCL_NAMESPACE_BEGIN + +ccl_device_inline void kernel_filter_construct_gramian(int x, int y, + int storage_stride, + int dx, int dy, + int w, int h, + int pass_stride, + const ccl_global float *ccl_restrict buffer, + const ccl_global float *ccl_restrict transform, + ccl_global int *rank, + float weight, + ccl_global float *XtWX, + ccl_global float3 *XtWY, + int localIdx) +{ + if(weight < 1e-3f) { + return; + } + + int p_offset = y *w + x; + int q_offset = (y+dy)*w + (x+dx); + +#ifdef __KERNEL_GPU__ + const int stride = storage_stride; +#else + const int stride = 1; + (void) storage_stride; +#endif + +#ifdef __KERNEL_CUDA__ + ccl_local float shared_design_row[(DENOISE_FEATURES+1)*CCL_MAX_LOCAL_SIZE]; + ccl_local_param float *design_row = shared_design_row + localIdx*(DENOISE_FEATURES+1); +#else + float design_row[DENOISE_FEATURES+1]; +#endif + + float3 q_color = filter_get_color(buffer + q_offset, pass_stride); + + /* If the pixel was flagged as an outlier during prefiltering, skip it. */ + if(ccl_get_feature(buffer + q_offset, 0) < 0.0f) { + return; + } + + filter_get_design_row_transform(make_int2(x, y), buffer + p_offset, + make_int2(x+dx, y+dy), buffer + q_offset, + pass_stride, *rank, design_row, transform, stride); + + math_trimatrix_add_gramian_strided(XtWX, (*rank)+1, design_row, weight, stride); + math_vec3_add_strided(XtWY, (*rank)+1, design_row, weight * q_color, stride); +} + +ccl_device_inline void kernel_filter_finalize(int x, int y, int w, int h, + ccl_global float *buffer, + ccl_global int *rank, + int storage_stride, + ccl_global float *XtWX, + ccl_global float3 *XtWY, + int4 buffer_params, + int sample) +{ +#ifdef __KERNEL_GPU__ + const int stride = storage_stride; +#else + const int stride = 1; + (void) storage_stride; +#endif + + if(XtWX[0] < 1e-3f) { + /* There is not enough information to determine a denoised result. + * As a fallback, keep the original value of the pixel. 
*/ + return; + } + + /* The weighted average of pixel colors (essentially, the NLM-filtered image). + * In case the solution of the linear model fails due to numerical issues, + * fall back to this value. */ + float3 mean_color = XtWY[0]/XtWX[0]; + + math_trimatrix_vec3_solve(XtWX, XtWY, (*rank)+1, stride); + + float3 final_color = XtWY[0]; + if(!isfinite3_safe(final_color)) { + final_color = mean_color; + } + + /* Clamp pixel value to positive values. */ + final_color = max(final_color, make_float3(0.0f, 0.0f, 0.0f)); + + ccl_global float *combined_buffer = buffer + (y*buffer_params.y + x + buffer_params.x)*buffer_params.z; + final_color *= sample; + if(buffer_params.w) { + final_color.x += combined_buffer[buffer_params.w+0]; + final_color.y += combined_buffer[buffer_params.w+1]; + final_color.z += combined_buffer[buffer_params.w+2]; + } + combined_buffer[0] = final_color.x; + combined_buffer[1] = final_color.y; + combined_buffer[2] = final_color.z; +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/filter/filter_transform.h b/intern/cycles/kernel/filter/filter_transform.h new file mode 100644 index 00000000000..a5f87c05ec0 --- /dev/null +++ b/intern/cycles/kernel/filter/filter_transform.h @@ -0,0 +1,108 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +CCL_NAMESPACE_BEGIN + +ccl_device void kernel_filter_construct_transform(const float *ccl_restrict buffer, + int x, int y, int4 rect, + int pass_stride, + float *transform, int *rank, + int radius, float pca_threshold) +{ + int buffer_w = align_up(rect.z - rect.x, 4); + + float features[DENOISE_FEATURES]; + + /* Temporary storage, used in different steps of the algorithm. */ + float tempmatrix[DENOISE_FEATURES*DENOISE_FEATURES]; + float tempvector[2*DENOISE_FEATURES]; + const float *ccl_restrict pixel_buffer; + int2 pixel; + + /* === Calculate denoising window. === */ + int2 low = make_int2(max(rect.x, x - radius), + max(rect.y, y - radius)); + int2 high = make_int2(min(rect.z, x + radius + 1), + min(rect.w, y + radius + 1)); + int num_pixels = (high.y - low.y) * (high.x - low.x); + + /* === Shift feature passes to have mean 0. === */ + float feature_means[DENOISE_FEATURES]; + math_vector_zero(feature_means, DENOISE_FEATURES); + FOR_PIXEL_WINDOW { + filter_get_features(pixel, pixel_buffer, features, NULL, pass_stride); + math_vector_add(feature_means, features, DENOISE_FEATURES); + } END_FOR_PIXEL_WINDOW + + math_vector_scale(feature_means, 1.0f / num_pixels, DENOISE_FEATURES); + + /* === Scale the shifted feature passes to a range of [-1; 1], will be baked into the transform later. === */ + float *feature_scale = tempvector; + math_vector_zero(feature_scale, DENOISE_FEATURES); + + FOR_PIXEL_WINDOW { + filter_get_feature_scales(pixel, pixel_buffer, features, feature_means, pass_stride); + math_vector_max(feature_scale, features, DENOISE_FEATURES); + } END_FOR_PIXEL_WINDOW + + filter_calculate_scale(feature_scale); + + /* === Generate the feature transformation. === + * This transformation maps the DENOISE_FEATURES-dimentional feature space to a reduced feature (r-feature) space + * which generally has fewer dimensions. This mainly helps to prevent overfitting. 
*/ + float* feature_matrix = tempmatrix; + math_matrix_zero(feature_matrix, DENOISE_FEATURES); + FOR_PIXEL_WINDOW { + filter_get_features(pixel, pixel_buffer, features, feature_means, pass_stride); + math_vector_mul(features, feature_scale, DENOISE_FEATURES); + math_matrix_add_gramian(feature_matrix, DENOISE_FEATURES, features, 1.0f); + } END_FOR_PIXEL_WINDOW + + math_matrix_jacobi_eigendecomposition(feature_matrix, transform, DENOISE_FEATURES, 1); + *rank = 0; + /* Prevent overfitting when a small window is used. */ + int max_rank = min(DENOISE_FEATURES, num_pixels/3); + if(pca_threshold < 0.0f) { + float threshold_energy = 0.0f; + for(int i = 0; i < DENOISE_FEATURES; i++) { + threshold_energy += feature_matrix[i*DENOISE_FEATURES+i]; + } + threshold_energy *= 1.0f - (-pca_threshold); + + float reduced_energy = 0.0f; + for(int i = 0; i < max_rank; i++, (*rank)++) { + if(i >= 2 && reduced_energy >= threshold_energy) + break; + float s = feature_matrix[i*DENOISE_FEATURES+i]; + reduced_energy += s; + } + } + else { + for(int i = 0; i < max_rank; i++, (*rank)++) { + float s = feature_matrix[i*DENOISE_FEATURES+i]; + if(i >= 2 && sqrtf(s) < pca_threshold) + break; + } + } + + /* Bake the feature scaling into the transformation matrix. */ + for(int i = 0; i < (*rank); i++) { + math_vector_mul(transform + i*DENOISE_FEATURES, feature_scale, DENOISE_FEATURES); + } + math_matrix_transpose(transform, DENOISE_FEATURES, 1); +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/filter/filter_transform_gpu.h b/intern/cycles/kernel/filter/filter_transform_gpu.h new file mode 100644 index 00000000000..83a1222bbdb --- /dev/null +++ b/intern/cycles/kernel/filter/filter_transform_gpu.h @@ -0,0 +1,119 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +CCL_NAMESPACE_BEGIN + +ccl_device void kernel_filter_construct_transform(const ccl_global float *ccl_restrict buffer, + int x, int y, int4 rect, + int pass_stride, + ccl_global float *transform, + ccl_global int *rank, + int radius, float pca_threshold, + int transform_stride, int localIdx) +{ + int buffer_w = align_up(rect.z - rect.x, 4); + +#ifdef __KERNEL_CUDA__ + ccl_local float shared_features[DENOISE_FEATURES*CCL_MAX_LOCAL_SIZE]; + ccl_local_param float *features = shared_features + localIdx*DENOISE_FEATURES; +#else + float features[DENOISE_FEATURES]; +#endif + + /* === Calculate denoising window. === */ + int2 low = make_int2(max(rect.x, x - radius), + max(rect.y, y - radius)); + int2 high = make_int2(min(rect.z, x + radius + 1), + min(rect.w, y + radius + 1)); + int num_pixels = (high.y - low.y) * (high.x - low.x); + const ccl_global float *ccl_restrict pixel_buffer; + int2 pixel; + + + + + /* === Shift feature passes to have mean 0. === */ + float feature_means[DENOISE_FEATURES]; + math_vector_zero(feature_means, DENOISE_FEATURES); + FOR_PIXEL_WINDOW { + filter_get_features(pixel, pixel_buffer, features, NULL, pass_stride); + math_vector_add(feature_means, features, DENOISE_FEATURES); + } END_FOR_PIXEL_WINDOW + + math_vector_scale(feature_means, 1.0f / num_pixels, DENOISE_FEATURES); + + /* === Scale the shifted feature passes to a range of [-1; 1], will be baked into the transform later. 
=== */ + float feature_scale[DENOISE_FEATURES]; + math_vector_zero(feature_scale, DENOISE_FEATURES); + + FOR_PIXEL_WINDOW { + filter_get_feature_scales(pixel, pixel_buffer, features, feature_means, pass_stride); + math_vector_max(feature_scale, features, DENOISE_FEATURES); + } END_FOR_PIXEL_WINDOW + + filter_calculate_scale(feature_scale); + + + + /* === Generate the feature transformation. === + * This transformation maps the DENOISE_FEATURES-dimentional feature space to a reduced feature (r-feature) space + * which generally has fewer dimensions. This mainly helps to prevent overfitting. */ + float feature_matrix[DENOISE_FEATURES*DENOISE_FEATURES]; + math_matrix_zero(feature_matrix, DENOISE_FEATURES); + FOR_PIXEL_WINDOW { + filter_get_features(pixel, pixel_buffer, features, feature_means, pass_stride); + math_vector_mul(features, feature_scale, DENOISE_FEATURES); + math_matrix_add_gramian(feature_matrix, DENOISE_FEATURES, features, 1.0f); + } END_FOR_PIXEL_WINDOW + + math_matrix_jacobi_eigendecomposition(feature_matrix, transform, DENOISE_FEATURES, transform_stride); + *rank = 0; + /* Prevent overfitting when a small window is used. */ + int max_rank = min(DENOISE_FEATURES, num_pixels/3); + if(pca_threshold < 0.0f) { + float threshold_energy = 0.0f; + for(int i = 0; i < DENOISE_FEATURES; i++) { + threshold_energy += feature_matrix[i*DENOISE_FEATURES+i]; + } + threshold_energy *= 1.0f - (-pca_threshold); + + float reduced_energy = 0.0f; + for(int i = 0; i < max_rank; i++, (*rank)++) { + if(i >= 2 && reduced_energy >= threshold_energy) + break; + float s = feature_matrix[i*DENOISE_FEATURES+i]; + reduced_energy += s; + } + } + else { + for(int i = 0; i < max_rank; i++, (*rank)++) { + float s = feature_matrix[i*DENOISE_FEATURES+i]; + if(i >= 2 && sqrtf(s) < pca_threshold) + break; + } + } + + math_matrix_transpose(transform, DENOISE_FEATURES, transform_stride); + + /* Bake the feature scaling into the transformation matrix. 
*/ + for(int i = 0; i < DENOISE_FEATURES; i++) { + for(int j = 0; j < (*rank); j++) { + transform[(i*DENOISE_FEATURES + j)*transform_stride] *= feature_scale[i]; + } + } +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/filter/filter_transform_sse.h b/intern/cycles/kernel/filter/filter_transform_sse.h new file mode 100644 index 00000000000..30dc2969b11 --- /dev/null +++ b/intern/cycles/kernel/filter/filter_transform_sse.h @@ -0,0 +1,105 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +CCL_NAMESPACE_BEGIN + +ccl_device void kernel_filter_construct_transform(const float *ccl_restrict buffer, + int x, int y, int4 rect, + int pass_stride, + float *transform, int *rank, + int radius, float pca_threshold) +{ + int buffer_w = align_up(rect.z - rect.x, 4); + + __m128 features[DENOISE_FEATURES]; + const float *ccl_restrict pixel_buffer; + int2 pixel; + + int2 low = make_int2(max(rect.x, x - radius), + max(rect.y, y - radius)); + int2 high = make_int2(min(rect.z, x + radius + 1), + min(rect.w, y + radius + 1)); + int num_pixels = (high.y - low.y) * (high.x - low.x); + + __m128 feature_means[DENOISE_FEATURES]; + math_vector_zero_sse(feature_means, DENOISE_FEATURES); + FOR_PIXEL_WINDOW_SSE { + filter_get_features_sse(x4, y4, active_pixels, pixel_buffer, features, NULL, pass_stride); + math_vector_add_sse(feature_means, DENOISE_FEATURES, features); + } END_FOR_PIXEL_WINDOW_SSE + + __m128 pixel_scale = _mm_set1_ps(1.0f / num_pixels); + for(int i = 0; i < DENOISE_FEATURES; i++) { + feature_means[i] = _mm_mul_ps(_mm_hsum_ps(feature_means[i]), pixel_scale); + } + + __m128 feature_scale[DENOISE_FEATURES]; + math_vector_zero_sse(feature_scale, DENOISE_FEATURES); + FOR_PIXEL_WINDOW_SSE { + filter_get_feature_scales_sse(x4, y4, active_pixels, pixel_buffer, features, feature_means, pass_stride); + math_vector_max_sse(feature_scale, features, DENOISE_FEATURES); + } END_FOR_PIXEL_WINDOW_SSE + + filter_calculate_scale_sse(feature_scale); + + __m128 feature_matrix_sse[DENOISE_FEATURES*DENOISE_FEATURES]; + math_matrix_zero_sse(feature_matrix_sse, DENOISE_FEATURES); + FOR_PIXEL_WINDOW_SSE { + filter_get_features_sse(x4, y4, active_pixels, pixel_buffer, features, feature_means, pass_stride); + math_vector_mul_sse(features, DENOISE_FEATURES, feature_scale); + math_matrix_add_gramian_sse(feature_matrix_sse, DENOISE_FEATURES, features, _mm_set1_ps(1.0f)); + } END_FOR_PIXEL_WINDOW_SSE + + float feature_matrix[DENOISE_FEATURES*DENOISE_FEATURES]; + 
math_matrix_hsum(feature_matrix, DENOISE_FEATURES, feature_matrix_sse); + + math_matrix_jacobi_eigendecomposition(feature_matrix, transform, DENOISE_FEATURES, 1); + + *rank = 0; + /* Prevent overfitting when a small window is used. */ + int max_rank = min(DENOISE_FEATURES, num_pixels/3); + if(pca_threshold < 0.0f) { + float threshold_energy = 0.0f; + for(int i = 0; i < DENOISE_FEATURES; i++) { + threshold_energy += feature_matrix[i*DENOISE_FEATURES+i]; + } + threshold_energy *= 1.0f - (-pca_threshold); + + float reduced_energy = 0.0f; + for(int i = 0; i < max_rank; i++, (*rank)++) { + if(i >= 2 && reduced_energy >= threshold_energy) + break; + float s = feature_matrix[i*DENOISE_FEATURES+i]; + reduced_energy += s; + } + } + else { + for(int i = 0; i < max_rank; i++, (*rank)++) { + float s = feature_matrix[i*DENOISE_FEATURES+i]; + if(i >= 2 && sqrtf(s) < pca_threshold) + break; + } + } + + math_matrix_transpose(transform, DENOISE_FEATURES, 1); + + /* Bake the feature scaling into the transformation matrix. 
*/ + for(int i = 0; i < DENOISE_FEATURES; i++) { + math_vector_scale(transform + i*DENOISE_FEATURES, _mm_cvtss_f32(feature_scale[i]), *rank); + } +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/geom/geom_curve.h b/intern/cycles/kernel/geom/geom_curve.h index 8888000f0e6..5c3b0ee3c15 100644 --- a/intern/cycles/kernel/geom/geom_curve.h +++ b/intern/cycles/kernel/geom/geom_curve.h @@ -565,7 +565,7 @@ ccl_device_curveintersect bool bvh_cardinal_curve_intersect(KernelGlobals *kg, I r_ext = mw_extension + r_curr; #ifdef __KERNEL_SSE__ const float3 p_curr_sq = p_curr * p_curr; - const float3 dxxx = _mm_sqrt_ss(_mm_hadd_ps(p_curr_sq.m128, p_curr_sq.m128)); + const float3 dxxx(_mm_sqrt_ss(_mm_hadd_ps(p_curr_sq.m128, p_curr_sq.m128))); float d = dxxx.x; #else float d = sqrtf(p_curr.x * p_curr.x + p_curr.y * p_curr.y); diff --git a/intern/cycles/kernel/geom/geom_triangle.h b/intern/cycles/kernel/geom/geom_triangle.h index 47778553b94..105aee8da15 100644 --- a/intern/cycles/kernel/geom/geom_triangle.h +++ b/intern/cycles/kernel/geom/geom_triangle.h @@ -76,7 +76,7 @@ ccl_device_inline void triangle_vertices(KernelGlobals *kg, int prim, float3 P[3 /* Interpolate smooth vertex normal from vertices */ -ccl_device_inline float3 triangle_smooth_normal(KernelGlobals *kg, int prim, float u, float v) +ccl_device_inline float3 triangle_smooth_normal(KernelGlobals *kg, float3 Ng, int prim, float u, float v) { /* load triangle vertices */ const uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, prim); @@ -84,7 +84,9 @@ ccl_device_inline float3 triangle_smooth_normal(KernelGlobals *kg, int prim, flo float3 n1 = float4_to_float3(kernel_tex_fetch(__tri_vnormal, tri_vindex.y)); float3 n2 = float4_to_float3(kernel_tex_fetch(__tri_vnormal, tri_vindex.z)); - return normalize((1.0f - u - v)*n2 + u*n0 + v*n1); + float3 N = safe_normalize((1.0f - u - v)*n2 + u*n0 + v*n1); + + return is_zero(N)? 
Ng: N; } /* Ray differentials on triangle */ diff --git a/intern/cycles/kernel/kernel.h b/intern/cycles/kernel/kernel.h index 06c0fb2fbca..84a988f1dbc 100644 --- a/intern/cycles/kernel/kernel.h +++ b/intern/cycles/kernel/kernel.h @@ -50,30 +50,20 @@ void kernel_tex_copy(KernelGlobals *kg, #define KERNEL_ARCH cpu #include "kernel/kernels/cpu/kernel_cpu.h" -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 -# define KERNEL_ARCH cpu_sse2 -# include "kernel/kernels/cpu/kernel_cpu.h" -#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 */ - -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 -# define KERNEL_ARCH cpu_sse3 -# include "kernel/kernels/cpu/kernel_cpu.h" -#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 */ - -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 -# define KERNEL_ARCH cpu_sse41 -# include "kernel/kernels/cpu/kernel_cpu.h" -#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 */ - -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX -# define KERNEL_ARCH cpu_avx -# include "kernel/kernels/cpu/kernel_cpu.h" -#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX */ - -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 -# define KERNEL_ARCH cpu_avx2 -# include "kernel/kernels/cpu/kernel_cpu.h" -#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 */ +#define KERNEL_ARCH cpu_sse2 +#include "kernel/kernels/cpu/kernel_cpu.h" + +#define KERNEL_ARCH cpu_sse3 +#include "kernel/kernels/cpu/kernel_cpu.h" + +#define KERNEL_ARCH cpu_sse41 +#include "kernel/kernels/cpu/kernel_cpu.h" + +#define KERNEL_ARCH cpu_avx +#include "kernel/kernels/cpu/kernel_cpu.h" + +#define KERNEL_ARCH cpu_avx2 +#include "kernel/kernels/cpu/kernel_cpu.h" CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernel_accumulate.h b/intern/cycles/kernel/kernel_accumulate.h index 823d30dde78..9ed16aceb55 100644 --- a/intern/cycles/kernel/kernel_accumulate.h +++ b/intern/cycles/kernel/kernel_accumulate.h @@ -220,8 +220,16 @@ ccl_device_inline void path_radiance_init(PathRadiance *L, int use_light_pass) #ifdef __SHADOW_TRICKS__ L->path_total = make_float3(0.0f, 0.0f, 0.0f); 
L->path_total_shaded = make_float3(0.0f, 0.0f, 0.0f); - L->shadow_color = make_float3(0.0f, 0.0f, 0.0f); + L->shadow_background_color = make_float3(0.0f, 0.0f, 0.0f); + L->shadow_radiance_sum = make_float3(0.0f, 0.0f, 0.0f); + L->shadow_throughput = 0.0f; #endif + +#ifdef __DENOISING_FEATURES__ + L->denoising_normal = make_float3(0.0f, 0.0f, 0.0f); + L->denoising_albedo = make_float3(0.0f, 0.0f, 0.0f); + L->denoising_depth = 0.0f; +#endif /* __DENOISING_FEATURES__ */ } ccl_device_inline void path_radiance_bsdf_bounce(PathRadiance *L, ccl_addr_space float3 *throughput, @@ -277,15 +285,15 @@ ccl_device_inline void path_radiance_accum_emission(PathRadiance *L, float3 thro } ccl_device_inline void path_radiance_accum_ao(PathRadiance *L, + ccl_addr_space PathState *state, float3 throughput, float3 alpha, float3 bsdf, - float3 ao, - int bounce) + float3 ao) { #ifdef __PASSES__ if(L->use_light_pass) { - if(bounce == 0) { + if(state->bounce == 0) { /* directly visible lighting */ L->direct_diffuse += throughput*bsdf*ao; L->ao += alpha*throughput*ao; @@ -302,31 +310,43 @@ ccl_device_inline void path_radiance_accum_ao(PathRadiance *L, } #ifdef __SHADOW_TRICKS__ - float3 light = throughput * bsdf; - L->path_total += light; - L->path_total_shaded += ao * light; + if(state->flag & PATH_RAY_STORE_SHADOW_INFO) { + float3 light = throughput * bsdf; + L->path_total += light; + L->path_total_shaded += ao * light; + } #endif } ccl_device_inline void path_radiance_accum_total_ao( PathRadiance *L, + ccl_addr_space PathState *state, float3 throughput, float3 bsdf) { #ifdef __SHADOW_TRICKS__ - L->path_total += throughput * bsdf; + if(state->flag & PATH_RAY_STORE_SHADOW_INFO) { + L->path_total += throughput * bsdf; + } #else (void) L; + (void) state; (void) throughput; (void) bsdf; #endif } -ccl_device_inline void path_radiance_accum_light(PathRadiance *L, float3 throughput, BsdfEval *bsdf_eval, float3 shadow, float shadow_fac, int bounce, bool is_lamp) +ccl_device_inline void 
path_radiance_accum_light(PathRadiance *L, + ccl_addr_space PathState *state, + float3 throughput, + BsdfEval *bsdf_eval, + float3 shadow, + float shadow_fac, + bool is_lamp) { #ifdef __PASSES__ if(L->use_light_pass) { - if(bounce == 0) { + if(state->bounce == 0) { /* directly visible lighting */ L->direct_diffuse += throughput*bsdf_eval->diffuse*shadow; L->direct_glossy += throughput*bsdf_eval->glossy*shadow; @@ -352,21 +372,27 @@ ccl_device_inline void path_radiance_accum_light(PathRadiance *L, float3 through } #ifdef __SHADOW_TRICKS__ - float3 light = throughput * bsdf_eval->sum_no_mis; - L->path_total += light; - L->path_total_shaded += shadow * light; + if(state->flag & PATH_RAY_STORE_SHADOW_INFO) { + float3 light = throughput * bsdf_eval->sum_no_mis; + L->path_total += light; + L->path_total_shaded += shadow * light; + } #endif } ccl_device_inline void path_radiance_accum_total_light( PathRadiance *L, + ccl_addr_space PathState *state, float3 throughput, const BsdfEval *bsdf_eval) { #ifdef __SHADOW_TRICKS__ - L->path_total += throughput * bsdf_eval->sum_no_mis; + if(state->flag & PATH_RAY_STORE_SHADOW_INFO) { + L->path_total += throughput * bsdf_eval->sum_no_mis; + } #else (void) L; + (void) state; (void) throughput; (void) bsdf_eval; #endif @@ -393,11 +419,17 @@ ccl_device_inline void path_radiance_accum_background(PathRadiance *L, } #ifdef __SHADOW_TRICKS__ - L->path_total += throughput * value; - if(state->flag & PATH_RAY_SHADOW_CATCHER_ONLY) { - L->path_total_shaded += throughput * value; + if(state->flag & PATH_RAY_STORE_SHADOW_INFO) { + L->path_total += throughput * value; + if(state->flag & PATH_RAY_SHADOW_CATCHER_ONLY) { + L->path_total_shaded += throughput * value; + } } #endif + +#ifdef __DENOISING_FEATURES__ + L->denoising_albedo += state->denoising_feature_weight * value; +#endif /* __DENOISING_FEATURES__ */ } ccl_device_inline void path_radiance_sum_indirect(PathRadiance *L) @@ -555,29 +587,79 @@ ccl_device_inline float3 
path_radiance_clamp_and_sum(KernelGlobals *kg, PathRadi return L_sum; } +ccl_device_inline void path_radiance_split_denoising(KernelGlobals *kg, PathRadiance *L, float3 *noisy, float3 *clean) +{ +#ifdef __PASSES__ + kernel_assert(L->use_light_pass); + + *clean = L->emission + L->background; + *noisy = L->direct_scatter + L->indirect_scatter; + +# define ADD_COMPONENT(flag, component) \ + if(kernel_data.film.denoising_flags & flag) \ + *clean += component; \ + else \ + *noisy += component; + + ADD_COMPONENT(DENOISING_CLEAN_DIFFUSE_DIR, L->direct_diffuse); + ADD_COMPONENT(DENOISING_CLEAN_DIFFUSE_IND, L->indirect_diffuse); + ADD_COMPONENT(DENOISING_CLEAN_GLOSSY_DIR, L->direct_glossy); + ADD_COMPONENT(DENOISING_CLEAN_GLOSSY_IND, L->indirect_glossy); + ADD_COMPONENT(DENOISING_CLEAN_TRANSMISSION_DIR, L->direct_transmission); + ADD_COMPONENT(DENOISING_CLEAN_TRANSMISSION_IND, L->indirect_transmission); + ADD_COMPONENT(DENOISING_CLEAN_SUBSURFACE_DIR, L->direct_subsurface); + ADD_COMPONENT(DENOISING_CLEAN_SUBSURFACE_IND, L->indirect_subsurface); +# undef ADD_COMPONENT +#else + *noisy = L->emission; + *clean = make_float3(0.0f, 0.0f, 0.0f); +#endif + + *noisy = ensure_finite3(*noisy); + *clean = ensure_finite3(*clean); +} + ccl_device_inline void path_radiance_accum_sample(PathRadiance *L, PathRadiance *L_sample, int num_samples) { float fac = 1.0f/num_samples; +#ifdef __SPLIT_KERNEL__ +# define safe_float3_add(f, v) \ + do { \ + ccl_global float *p = (ccl_global float*)(&(f)); \ + atomic_add_and_fetch_float(p+0, (v).x); \ + atomic_add_and_fetch_float(p+1, (v).y); \ + atomic_add_and_fetch_float(p+2, (v).z); \ + } while(0) +#else +# define safe_float3_add(f, v) (f) += (v) +#endif /* __SPLIT_KERNEL__ */ + #ifdef __PASSES__ - L->direct_diffuse += L_sample->direct_diffuse*fac; - L->direct_glossy += L_sample->direct_glossy*fac; - L->direct_transmission += L_sample->direct_transmission*fac; - L->direct_subsurface += L_sample->direct_subsurface*fac; - L->direct_scatter += 
L_sample->direct_scatter*fac; - - L->indirect_diffuse += L_sample->indirect_diffuse*fac; - L->indirect_glossy += L_sample->indirect_glossy*fac; - L->indirect_transmission += L_sample->indirect_transmission*fac; - L->indirect_subsurface += L_sample->indirect_subsurface*fac; - L->indirect_scatter += L_sample->indirect_scatter*fac; - - L->background += L_sample->background*fac; - L->ao += L_sample->ao*fac; - L->shadow += L_sample->shadow*fac; + safe_float3_add(L->direct_diffuse, L_sample->direct_diffuse*fac); + safe_float3_add(L->direct_glossy, L_sample->direct_glossy*fac); + safe_float3_add(L->direct_transmission, L_sample->direct_transmission*fac); + safe_float3_add(L->direct_subsurface, L_sample->direct_subsurface*fac); + safe_float3_add(L->direct_scatter, L_sample->direct_scatter*fac); + + safe_float3_add(L->indirect_diffuse, L_sample->indirect_diffuse*fac); + safe_float3_add(L->indirect_glossy, L_sample->indirect_glossy*fac); + safe_float3_add(L->indirect_transmission, L_sample->indirect_transmission*fac); + safe_float3_add(L->indirect_subsurface, L_sample->indirect_subsurface*fac); + safe_float3_add(L->indirect_scatter, L_sample->indirect_scatter*fac); + + safe_float3_add(L->background, L_sample->background*fac); + safe_float3_add(L->ao, L_sample->ao*fac); + safe_float3_add(L->shadow, L_sample->shadow*fac); +# ifdef __SPLIT_KERNEL__ + atomic_add_and_fetch_float(&L->mist, L_sample->mist*fac); +# else L->mist += L_sample->mist*fac; -#endif - L->emission += L_sample->emission * fac; +# endif /* __SPLIT_KERNEL__ */ +#endif /* __PASSES__ */ + safe_float3_add(L->emission, L_sample->emission*fac); + +#undef safe_float3_add } #ifdef __SHADOW_TRICKS__ @@ -595,16 +677,17 @@ ccl_device_inline float path_radiance_sum_shadow(const PathRadiance *L) /* Calculate final light sum and transparency for shadow catcher object. 
*/ ccl_device_inline float3 path_radiance_sum_shadowcatcher(KernelGlobals *kg, const PathRadiance *L, - ccl_addr_space float* L_transparent) + float* alpha) { const float shadow = path_radiance_sum_shadow(L); float3 L_sum; if(kernel_data.background.transparent) { - *L_transparent = shadow; - L_sum = make_float3(0.0f, 0.0f, 0.0f); + *alpha = 1.0f - L->shadow_throughput * shadow; + L_sum = L->shadow_radiance_sum; } else { - L_sum = L->shadow_color * shadow; + L_sum = L->shadow_background_color * L->shadow_throughput * shadow + + L->shadow_radiance_sum; } return L_sum; } diff --git a/intern/cycles/kernel/kernel_compat_cpu.h b/intern/cycles/kernel/kernel_compat_cpu.h index 21da180bb8e..93934ee6b38 100644 --- a/intern/cycles/kernel/kernel_compat_cpu.h +++ b/intern/cycles/kernel/kernel_compat_cpu.h @@ -195,7 +195,7 @@ template<typename T> struct texture_image { if(x < 0.0f || y < 0.0f || x > 1.0f || y > 1.0f) { return make_float4(0.0f, 0.0f, 0.0f, 0.0f); } - /* Fall through. */ + ATTR_FALLTHROUGH; case EXTENSION_EXTEND: ix = wrap_clamp(ix, width); iy = wrap_clamp(iy, height); @@ -222,7 +222,7 @@ template<typename T> struct texture_image { if(x < 0.0f || y < 0.0f || x > 1.0f || y > 1.0f) { return make_float4(0.0f, 0.0f, 0.0f, 0.0f); } - /* Fall through. */ + ATTR_FALLTHROUGH; case EXTENSION_EXTEND: nix = wrap_clamp(ix+1, width); niy = wrap_clamp(iy+1, height); @@ -265,7 +265,7 @@ template<typename T> struct texture_image { if(x < 0.0f || y < 0.0f || x > 1.0f || y > 1.0f) { return make_float4(0.0f, 0.0f, 0.0f, 0.0f); } - /* Fall through. */ + ATTR_FALLTHROUGH; case EXTENSION_EXTEND: pix = wrap_clamp(ix-1, width); piy = wrap_clamp(iy-1, height); @@ -335,7 +335,7 @@ template<typename T> struct texture_image { { return make_float4(0.0f, 0.0f, 0.0f, 0.0f); } - /* Fall through. 
*/ + ATTR_FALLTHROUGH; case EXTENSION_EXTEND: ix = wrap_clamp(ix, width); iy = wrap_clamp(iy, height); @@ -374,7 +374,7 @@ template<typename T> struct texture_image { { return make_float4(0.0f, 0.0f, 0.0f, 0.0f); } - /* Fall through. */ + ATTR_FALLTHROUGH; case EXTENSION_EXTEND: nix = wrap_clamp(ix+1, width); niy = wrap_clamp(iy+1, height); @@ -449,7 +449,7 @@ template<typename T> struct texture_image { { return make_float4(0.0f, 0.0f, 0.0f, 0.0f); } - /* Fall through. */ + ATTR_FALLTHROUGH; case EXTENSION_EXTEND: pix = wrap_clamp(ix-1, width); piy = wrap_clamp(iy-1, height); diff --git a/intern/cycles/kernel/kernel_compat_cuda.h b/intern/cycles/kernel/kernel_compat_cuda.h index c375d17a95f..38708f7ff0b 100644 --- a/intern/cycles/kernel/kernel_compat_cuda.h +++ b/intern/cycles/kernel/kernel_compat_cuda.h @@ -55,6 +55,11 @@ #define ccl_restrict __restrict__ #define ccl_align(n) __align__(n) +#define ATTR_FALLTHROUGH + +#define CCL_MAX_LOCAL_SIZE (CUDA_THREADS_BLOCK_WIDTH*CUDA_THREADS_BLOCK_WIDTH) + + /* No assert supported for CUDA */ #define kernel_assert(cond) diff --git a/intern/cycles/kernel/kernel_compat_opencl.h b/intern/cycles/kernel/kernel_compat_opencl.h index c2263ac0d49..4836c290312 100644 --- a/intern/cycles/kernel/kernel_compat_opencl.h +++ b/intern/cycles/kernel/kernel_compat_opencl.h @@ -50,6 +50,8 @@ # define ccl_addr_space #endif +#define ATTR_FALLTHROUGH + #define ccl_local_id(d) get_local_id(d) #define ccl_global_id(d) get_global_id(d) diff --git a/intern/cycles/kernel/kernel_globals.h b/intern/cycles/kernel/kernel_globals.h index c9c97ea977e..f95f0d98c52 100644 --- a/intern/cycles/kernel/kernel_globals.h +++ b/intern/cycles/kernel/kernel_globals.h @@ -19,6 +19,10 @@ #ifndef __KERNEL_GLOBALS_H__ #define __KERNEL_GLOBALS_H__ +#ifdef __KERNEL_CPU__ +# include "util/util_vector.h" +#endif + CCL_NAMESPACE_BEGIN /* On the CPU, we pass along the struct KernelGlobals to nearly everywhere in @@ -38,12 +42,12 @@ struct Intersection; struct VolumeStep; 
typedef struct KernelGlobals { - texture_image_uchar4 texture_byte4_images[TEX_NUM_BYTE4_CPU]; - texture_image_float4 texture_float4_images[TEX_NUM_FLOAT4_CPU]; - texture_image_half4 texture_half4_images[TEX_NUM_HALF4_CPU]; - texture_image_float texture_float_images[TEX_NUM_FLOAT_CPU]; - texture_image_uchar texture_byte_images[TEX_NUM_BYTE_CPU]; - texture_image_half texture_half_images[TEX_NUM_HALF_CPU]; + vector<texture_image_float4> texture_float4_images; + vector<texture_image_uchar4> texture_byte4_images; + vector<texture_image_half4> texture_half4_images; + vector<texture_image_float> texture_float_images; + vector<texture_image_uchar> texture_byte_images; + vector<texture_image_half> texture_half_images; # define KERNEL_TEX(type, ttype, name) ttype name; # define KERNEL_IMAGE_TEX(type, ttype, name) diff --git a/intern/cycles/kernel/kernel_image_opencl.h b/intern/cycles/kernel/kernel_image_opencl.h index 0352c58037d..90747e09357 100644 --- a/intern/cycles/kernel/kernel_image_opencl.h +++ b/intern/cycles/kernel/kernel_image_opencl.h @@ -20,18 +20,19 @@ ccl_device_inline float4 svm_image_texture_read(KernelGlobals *kg, int id, int offset) { + const int texture_type = kernel_tex_type(id); /* Float4 */ - if(id < TEX_START_BYTE4_OPENCL) { + if(texture_type == IMAGE_DATA_TYPE_FLOAT4) { return kernel_tex_fetch(__tex_image_float4_packed, offset); } /* Byte4 */ - else if(id < TEX_START_FLOAT_OPENCL) { + else if(texture_type == IMAGE_DATA_TYPE_BYTE4) { uchar4 r = kernel_tex_fetch(__tex_image_byte4_packed, offset); float f = 1.0f/255.0f; return make_float4(r.x*f, r.y*f, r.z*f, r.w*f); } /* Float */ - else if(id < TEX_START_BYTE_OPENCL) { + else if(texture_type == IMAGE_DATA_TYPE_FLOAT) { float f = kernel_tex_fetch(__tex_image_float_packed, offset); return make_float4(f, f, f, 1.0f); } @@ -63,23 +64,34 @@ ccl_device_inline float svm_image_texture_frac(float x, int *ix) return x - (float)i; } +ccl_device_inline uint kernel_decode_image_interpolation(uint4 info) +{ + return 
(info.w & (1 << 0)) ? INTERPOLATION_CLOSEST : INTERPOLATION_LINEAR; +} + +ccl_device_inline uint kernel_decode_image_extension(uint4 info) +{ + if(info.w & (1 << 1)) { + return EXTENSION_REPEAT; + } + else if(info.w & (1 << 2)) { + return EXTENSION_EXTEND; + } + else { + return EXTENSION_CLIP; + } +} + ccl_device float4 kernel_tex_image_interp(KernelGlobals *kg, int id, float x, float y) { uint4 info = kernel_tex_fetch(__tex_image_packed_info, id*2); uint width = info.x; uint height = info.y; uint offset = info.z; - - /* Image Options */ - uint interpolation = (info.w & (1 << 0)) ? INTERPOLATION_CLOSEST : INTERPOLATION_LINEAR; - uint extension; - if(info.w & (1 << 1)) - extension = EXTENSION_REPEAT; - else if(info.w & (1 << 2)) - extension = EXTENSION_EXTEND; - else - extension = EXTENSION_CLIP; - + /* Decode image options. */ + uint interpolation = kernel_decode_image_interpolation(info); + uint extension = kernel_decode_image_extension(info); + /* Actual sampling. */ float4 r; int ix, iy, nix, niy; if(interpolation == INTERPOLATION_CLOSEST) { @@ -132,7 +144,6 @@ ccl_device float4 kernel_tex_image_interp(KernelGlobals *kg, int id, float x, fl r += ty*(1.0f - tx)*svm_image_texture_read(kg, id, offset + ix + niy*width); r += ty*tx*svm_image_texture_read(kg, id, offset + nix + niy*width); } - return r; } @@ -144,17 +155,10 @@ ccl_device float4 kernel_tex_image_interp_3d(KernelGlobals *kg, int id, float x, uint height = info.y; uint offset = info.z; uint depth = kernel_tex_fetch(__tex_image_packed_info, id*2+1).x; - - /* Image Options */ - uint interpolation = (info.w & (1 << 0)) ? INTERPOLATION_CLOSEST : INTERPOLATION_LINEAR; - uint extension; - if(info.w & (1 << 1)) - extension = EXTENSION_REPEAT; - else if(info.w & (1 << 2)) - extension = EXTENSION_EXTEND; - else - extension = EXTENSION_CLIP; - + /* Decode image options. */ + uint interpolation = kernel_decode_image_interpolation(info); + uint extension = kernel_decode_image_extension(info); + /* Actual sampling. 
*/ float4 r; int ix, iy, iz, nix, niy, niz; if(interpolation == INTERPOLATION_CLOSEST) { @@ -171,7 +175,7 @@ ccl_device float4 kernel_tex_image_interp_3d(KernelGlobals *kg, int id, float x, if(extension == EXTENSION_CLIP) { if(x < 0.0f || y < 0.0f || z < 0.0f || x > 1.0f || y > 1.0f || z > 1.0f) - { + { return make_float4(0.0f, 0.0f, 0.0f, 0.0f); } } @@ -198,12 +202,13 @@ ccl_device float4 kernel_tex_image_interp_3d(KernelGlobals *kg, int id, float x, niz = svm_image_texture_wrap_periodic(iz+1, depth); } else { - if(extension == EXTENSION_CLIP) + if(extension == EXTENSION_CLIP) { if(x < 0.0f || y < 0.0f || z < 0.0f || x > 1.0f || y > 1.0f || z > 1.0f) { return make_float4(0.0f, 0.0f, 0.0f, 0.0f); } + } /* Fall through. */ /* EXTENSION_EXTEND */ nix = svm_image_texture_wrap_clamp(ix+1, width); @@ -224,8 +229,6 @@ ccl_device float4 kernel_tex_image_interp_3d(KernelGlobals *kg, int id, float x, r += tz*(1.0f - ty)*tx*svm_image_texture_read(kg, id, offset + nix + iy*width + niz*width*height); r += tz*ty*(1.0f - tx)*svm_image_texture_read(kg, id, offset + ix + niy*width + niz*width*height); r += tz*ty*tx*svm_image_texture_read(kg, id, offset + nix + niy*width + niz*width*height); - } - return r; } diff --git a/intern/cycles/kernel/kernel_jitter.h b/intern/cycles/kernel/kernel_jitter.h index 67546131746..f5855757d3f 100644 --- a/intern/cycles/kernel/kernel_jitter.h +++ b/intern/cycles/kernel/kernel_jitter.h @@ -175,15 +175,26 @@ ccl_device float cmj_sample_1D(int s, int N, int p) return (x + jx)*invN; } -ccl_device void cmj_sample_2D(int s, int N, int p, float *fx, float *fy) +/* TODO(sergey): Do some extra tests and consider moving to util_math.h. 
*/ +ccl_device_inline int cmj_isqrt(int value) { - kernel_assert(s < N); - #if defined(__KERNEL_CUDA__) - int m = float_to_int(__fsqrt_ru(N)); + return float_to_int(__fsqrt_ru(value)); +#elif defined(__KERNEL_GPU__) + return float_to_int(sqrtf(value)); #else - int m = float_to_int(sqrtf(N)); + /* This is a work around for fast-math on CPU which might replace sqrtf() + * with am approximated version. + */ + return float_to_int(sqrtf(value) + 1e-6f); #endif +} + +ccl_device void cmj_sample_2D(int s, int N, int p, float *fx, float *fy) +{ + kernel_assert(s < N); + + int m = cmj_isqrt(N); int n = (N - 1)/m + 1; float invN = 1.0f/N; float invm = 1.0f/m; diff --git a/intern/cycles/kernel/kernel_light.h b/intern/cycles/kernel/kernel_light.h index a2909cec1a1..9baa9d54957 100644 --- a/intern/cycles/kernel/kernel_light.h +++ b/intern/cycles/kernel/kernel_light.h @@ -102,7 +102,7 @@ ccl_device_inline float area_light_sample(float3 P, float cu = 1.0f / sqrtf(fu * fu + b0sq) * (fu > 0.0f ? 1.0f : -1.0f); cu = clamp(cu, -1.0f, 1.0f); /* Compute xu. */ - float xu = -(cu * z0) / sqrtf(1.0f - cu * cu); + float xu = -(cu * z0) / max(sqrtf(1.0f - cu * cu), 1e-7f); xu = clamp(xu, x0, x1); /* Compute yv. */ float z0sq = z0 * z0; diff --git a/intern/cycles/kernel/kernel_passes.h b/intern/cycles/kernel/kernel_passes.h index ed523696571..9cd7ffb181d 100644 --- a/intern/cycles/kernel/kernel_passes.h +++ b/intern/cycles/kernel/kernel_passes.h @@ -60,6 +60,140 @@ ccl_device_inline void kernel_write_pass_float4(ccl_global float *buffer, int sa #endif /* __SPLIT_KERNEL__ */ } +#ifdef __DENOISING_FEATURES__ +ccl_device_inline void kernel_write_pass_float_variance(ccl_global float *buffer, int sample, float value) +{ + kernel_write_pass_float(buffer, sample, value); + + /* The online one-pass variance update that's used for the megakernel can't easily be implemented + * with atomics, so for the split kernel the E[x^2] - 1/N * (E[x])^2 fallback is used. 
*/ +# ifdef __SPLIT_KERNEL__ + kernel_write_pass_float(buffer+1, sample, value*value); +# else + if(sample == 0) { + kernel_write_pass_float(buffer+1, sample, 0.0f); + } + else { + float new_mean = buffer[0] * (1.0f / (sample + 1)); + float old_mean = (buffer[0] - value) * (1.0f / sample); + kernel_write_pass_float(buffer+1, sample, (value - new_mean) * (value - old_mean)); + } +# endif +} + +# if defined(__SPLIT_KERNEL__) +# define kernel_write_pass_float3_unaligned kernel_write_pass_float3 +# else +ccl_device_inline void kernel_write_pass_float3_unaligned(ccl_global float *buffer, int sample, float3 value) +{ + buffer[0] = (sample == 0)? value.x: buffer[0] + value.x; + buffer[1] = (sample == 0)? value.y: buffer[1] + value.y; + buffer[2] = (sample == 0)? value.z: buffer[2] + value.z; +} +# endif + +ccl_device_inline void kernel_write_pass_float3_variance(ccl_global float *buffer, int sample, float3 value) +{ + kernel_write_pass_float3_unaligned(buffer, sample, value); +# ifdef __SPLIT_KERNEL__ + kernel_write_pass_float3_unaligned(buffer+3, sample, value*value); +# else + if(sample == 0) { + kernel_write_pass_float3_unaligned(buffer+3, sample, make_float3(0.0f, 0.0f, 0.0f)); + } + else { + float3 sum = make_float3(buffer[0], buffer[1], buffer[2]); + float3 new_mean = sum * (1.0f / (sample + 1)); + float3 old_mean = (sum - value) * (1.0f / sample); + kernel_write_pass_float3_unaligned(buffer+3, sample, (value - new_mean) * (value - old_mean)); + } +# endif +} + +ccl_device_inline void kernel_write_denoising_shadow(KernelGlobals *kg, ccl_global float *buffer, + int sample, float path_total, float path_total_shaded) +{ + if(kernel_data.film.pass_denoising_data == 0) + return; + + buffer += (sample & 1)? 
DENOISING_PASS_SHADOW_B : DENOISING_PASS_SHADOW_A; + + path_total = ensure_finite(path_total); + path_total_shaded = ensure_finite(path_total_shaded); + + kernel_write_pass_float(buffer, sample/2, path_total); + kernel_write_pass_float(buffer+1, sample/2, path_total_shaded); + + float value = path_total_shaded / max(path_total, 1e-7f); +# ifdef __SPLIT_KERNEL__ + kernel_write_pass_float(buffer+2, sample/2, value*value); +# else + if(sample < 2) { + kernel_write_pass_float(buffer+2, sample/2, 0.0f); + } + else { + float old_value = (buffer[1] - path_total_shaded) / max(buffer[0] - path_total, 1e-7f); + float new_value = buffer[1] / max(buffer[0], 1e-7f); + kernel_write_pass_float(buffer+2, sample, (value - new_value) * (value - old_value)); + } +# endif +} +#endif /* __DENOISING_FEATURES__ */ + +ccl_device_inline void kernel_update_denoising_features(KernelGlobals *kg, + ShaderData *sd, + ccl_addr_space PathState *state, + PathRadiance *L) +{ +#ifdef __DENOISING_FEATURES__ + if(state->denoising_feature_weight == 0.0f) { + return; + } + + L->denoising_depth += ensure_finite(state->denoising_feature_weight * sd->ray_length); + + /* Skip implicitly transparent surfaces. */ + if(sd->flag & SD_HAS_ONLY_VOLUME) { + return; + } + + float3 normal = make_float3(0.0f, 0.0f, 0.0f); + float3 albedo = make_float3(0.0f, 0.0f, 0.0f); + float sum_weight = 0.0f, sum_nonspecular_weight = 0.0f; + + for(int i = 0; i < sd->num_closure; i++) { + ShaderClosure *sc = &sd->closure[i]; + + if(!CLOSURE_IS_BSDF_OR_BSSRDF(sc->type)) + continue; + + /* All closures contribute to the normal feature, but only diffuse-like ones to the albedo. */ + normal += sc->N * sc->sample_weight; + sum_weight += sc->sample_weight; + if(!bsdf_is_specular_like(sc)) { + albedo += sc->weight; + sum_nonspecular_weight += sc->sample_weight; + } + } + + /* Wait for next bounce if 75% or more sample weight belongs to specular-like closures. 
*/ + if((sum_weight == 0.0f) || (sum_nonspecular_weight*4.0f > sum_weight)) { + if(sum_weight != 0.0f) { + normal /= sum_weight; + } + L->denoising_normal += ensure_finite3(state->denoising_feature_weight * normal); + L->denoising_albedo += ensure_finite3(state->denoising_feature_weight * albedo); + + state->denoising_feature_weight = 0.0f; + } +#else + (void) kg; + (void) sd; + (void) state; + (void) L; +#endif /* __DENOISING_FEATURES__ */ +} + ccl_device_inline void kernel_write_data_passes(KernelGlobals *kg, ccl_global float *buffer, PathRadiance *L, ShaderData *sd, int sample, ccl_addr_space PathState *state, float3 throughput) { @@ -199,5 +333,88 @@ ccl_device_inline void kernel_write_light_passes(KernelGlobals *kg, ccl_global f #endif } +ccl_device_inline void kernel_write_result(KernelGlobals *kg, ccl_global float *buffer, + int sample, PathRadiance *L, float alpha, bool is_shadow_catcher) +{ + if(L) { + float3 L_sum; +#ifdef __SHADOW_TRICKS__ + if(is_shadow_catcher) { + L_sum = path_radiance_sum_shadowcatcher(kg, L, &alpha); + } + else +#endif /* __SHADOW_TRICKS__ */ + { + L_sum = path_radiance_clamp_and_sum(kg, L); + } + + kernel_write_pass_float4(buffer, sample, make_float4(L_sum.x, L_sum.y, L_sum.z, alpha)); + + kernel_write_light_passes(kg, buffer, L, sample); + +#ifdef __DENOISING_FEATURES__ + if(kernel_data.film.pass_denoising_data) { +# ifdef __SHADOW_TRICKS__ + kernel_write_denoising_shadow(kg, buffer + kernel_data.film.pass_denoising_data, sample, average(L->path_total), average(L->path_total_shaded)); +# else + kernel_write_denoising_shadow(kg, buffer + kernel_data.film.pass_denoising_data, sample, 0.0f, 0.0f); +# endif + if(kernel_data.film.pass_denoising_clean) { + float3 noisy, clean; +#ifdef __SHADOW_TRICKS__ + if(is_shadow_catcher) { + noisy = L_sum; + clean = make_float3(0.0f, 0.0f, 0.0f); + } + else +#endif /* __SHADOW_TRICKS__ */ + { + path_radiance_split_denoising(kg, L, &noisy, &clean); + } + kernel_write_pass_float3_variance(buffer + 
kernel_data.film.pass_denoising_data + DENOISING_PASS_COLOR, + sample, noisy); + kernel_write_pass_float3_unaligned(buffer + kernel_data.film.pass_denoising_clean, + sample, clean); + } + else { + kernel_write_pass_float3_variance(buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_COLOR, + sample, ensure_finite3(L_sum)); + } + + kernel_write_pass_float3_variance(buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_NORMAL, + sample, L->denoising_normal); + kernel_write_pass_float3_variance(buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_ALBEDO, + sample, L->denoising_albedo); + kernel_write_pass_float_variance(buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_DEPTH, + sample, L->denoising_depth); + } +#endif /* __DENOISING_FEATURES__ */ + } + else { + kernel_write_pass_float4(buffer, sample, make_float4(0.0f, 0.0f, 0.0f, 0.0f)); + +#ifdef __DENOISING_FEATURES__ + if(kernel_data.film.pass_denoising_data) { + kernel_write_denoising_shadow(kg, buffer + kernel_data.film.pass_denoising_data, sample, 0.0f, 0.0f); + + kernel_write_pass_float3_variance(buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_COLOR, + sample, make_float3(0.0f, 0.0f, 0.0f)); + + kernel_write_pass_float3_variance(buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_NORMAL, + sample, make_float3(0.0f, 0.0f, 0.0f)); + kernel_write_pass_float3_variance(buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_ALBEDO, + sample, make_float3(0.0f, 0.0f, 0.0f)); + kernel_write_pass_float_variance(buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_DEPTH, + sample, 0.0f); + + if(kernel_data.film.pass_denoising_clean) { + kernel_write_pass_float3_unaligned(buffer + kernel_data.film.pass_denoising_clean, + sample, make_float3(0.0f, 0.0f, 0.0f)); + } + } +#endif /* __DENOISING_FEATURES__ */ + } +} + CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernel_path.h b/intern/cycles/kernel/kernel_path.h index 
e7957042182..c340b3bc968 100644 --- a/intern/cycles/kernel/kernel_path.h +++ b/intern/cycles/kernel/kernel_path.h @@ -58,7 +58,7 @@ ccl_device_noinline void kernel_path_ao(KernelGlobals *kg, ShaderData *sd, ShaderData *emission_sd, PathRadiance *L, - PathState *state, + ccl_addr_space PathState *state, RNG *rng, float3 throughput, float3 ao_alpha) @@ -90,14 +90,16 @@ ccl_device_noinline void kernel_path_ao(KernelGlobals *kg, light_ray.dD = differential3_zero(); if(!shadow_blocked(kg, emission_sd, state, &light_ray, &ao_shadow)) { - path_radiance_accum_ao(L, throughput, ao_alpha, ao_bsdf, ao_shadow, state->bounce); + path_radiance_accum_ao(L, state, throughput, ao_alpha, ao_bsdf, ao_shadow); } else { - path_radiance_accum_total_ao(L, throughput, ao_bsdf); + path_radiance_accum_total_ao(L, state, throughput, ao_bsdf); } } } +#ifndef __SPLIT_KERNEL__ + ccl_device void kernel_path_indirect(KernelGlobals *kg, ShaderData *sd, ShaderData *emission_sd, @@ -364,6 +366,8 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg, throughput /= probability; } + kernel_update_denoising_features(kg, sd, state, L); + #ifdef __AO__ /* ambient occlusion */ if(kernel_data.integrator.use_ambient_occlusion || (sd->flag & SD_AO)) { @@ -403,7 +407,7 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg, } #endif /* __SUBSURFACE__ */ -#if defined(__EMISSION__) && defined(__BRANCHED_PATH__) +#if defined(__EMISSION__) if(kernel_data.integrator.use_direct_light) { int all = (kernel_data.integrator.sample_all_lights_indirect) || (state->flag & PATH_RAY_SHADOW_CATCHER); @@ -417,7 +421,7 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg, L, all); } -#endif /* defined(__EMISSION__) && defined(__BRANCHED_PATH__) */ +#endif /* defined(__EMISSION__) */ if(!kernel_path_surface_bounce(kg, rng, sd, &throughput, state, L, ray)) break; @@ -425,18 +429,19 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg, } -ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg, - RNG 
*rng, - int sample, - Ray ray, - ccl_global float *buffer) +ccl_device_inline float kernel_path_integrate(KernelGlobals *kg, + RNG *rng, + int sample, + Ray ray, + ccl_global float *buffer, + PathRadiance *L, + bool *is_shadow_catcher) { /* initialize */ - PathRadiance L; float3 throughput = make_float3(1.0f, 1.0f, 1.0f); float L_transparent = 0.0f; - path_radiance_init(&L, kernel_data.film.use_light_pass); + path_radiance_init(L, kernel_data.film.use_light_pass); /* shader data memory used for both volumes and surfaces, saves stack space */ ShaderData sd; @@ -515,7 +520,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg, float3 emission; if(indirect_lamp_emission(kg, &emission_sd, &state, &light_ray, &emission)) - path_radiance_accum_emission(&L, throughput, emission, state.bounce); + path_radiance_accum_emission(L, throughput, emission, state.bounce); } #endif /* __LAMP_MIS__ */ @@ -547,7 +552,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg, /* emission */ if(volume_segment.closure_flag & SD_EMISSION) - path_radiance_accum_emission(&L, throughput, volume_segment.accum_emission, state.bounce); + path_radiance_accum_emission(L, throughput, volume_segment.accum_emission, state.bounce); /* scattering */ VolumeIntegrateResult result = VOLUME_PATH_ATTENUATED; @@ -557,7 +562,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg, /* direct light sampling */ kernel_branched_path_volume_connect_light(kg, rng, &sd, - &emission_sd, throughput, &state, &L, all, + &emission_sd, throughput, &state, L, all, &volume_ray, &volume_segment); /* indirect sample. 
if we use distance sampling and take just @@ -575,7 +580,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg, kernel_volume_decoupled_free(kg, &volume_segment); if(result == VOLUME_PATH_SCATTERED) { - if(kernel_path_volume_bounce(kg, rng, &sd, &throughput, &state, &L, &ray)) + if(kernel_path_volume_bounce(kg, rng, &sd, &throughput, &state, L, &ray)) continue; else break; @@ -589,15 +594,15 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg, { /* integrate along volume segment with distance sampling */ VolumeIntegrateResult result = kernel_volume_integrate( - kg, &state, &sd, &volume_ray, &L, &throughput, rng, heterogeneous); + kg, &state, &sd, &volume_ray, L, &throughput, rng, heterogeneous); # ifdef __VOLUME_SCATTER__ if(result == VOLUME_PATH_SCATTERED) { /* direct lighting */ - kernel_path_volume_connect_light(kg, rng, &sd, &emission_sd, throughput, &state, &L); + kernel_path_volume_connect_light(kg, rng, &sd, &emission_sd, throughput, &state, L); /* indirect light bounce */ - if(kernel_path_volume_bounce(kg, rng, &sd, &throughput, &state, &L, &ray)) + if(kernel_path_volume_bounce(kg, rng, &sd, &throughput, &state, L, &ray)) continue; else break; @@ -621,7 +626,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg, #ifdef __BACKGROUND__ /* sample background shader */ float3 L_background = indirect_background(kg, &emission_sd, &state, &ray); - path_radiance_accum_background(&L, &state, throughput, L_background); + path_radiance_accum_background(L, &state, throughput, L_background); #endif /* __BACKGROUND__ */ break; @@ -638,11 +643,16 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg, #ifdef __SHADOW_TRICKS__ if((sd.object_flag & SD_OBJECT_SHADOW_CATCHER)) { if(state.flag & PATH_RAY_CAMERA) { - state.flag |= (PATH_RAY_SHADOW_CATCHER | PATH_RAY_SHADOW_CATCHER_ONLY); + state.flag |= (PATH_RAY_SHADOW_CATCHER | + PATH_RAY_SHADOW_CATCHER_ONLY | + PATH_RAY_STORE_SHADOW_INFO); state.catcher_object = 
sd.object; if(!kernel_data.background.transparent) { - L.shadow_color = indirect_background(kg, &emission_sd, &state, &ray); + L->shadow_background_color = + indirect_background(kg, &emission_sd, &state, &ray); } + L->shadow_radiance_sum = path_radiance_clamp_and_sum(kg, L); + L->shadow_throughput = average(throughput); } } else { @@ -675,7 +685,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg, #endif /* __HOLDOUT__ */ /* holdout mask objects do not write data passes */ - kernel_write_data_passes(kg, buffer, &L, &sd, sample, &state, throughput); + kernel_write_data_passes(kg, buffer, L, &sd, sample, &state, throughput); /* blurring of bsdf after bounces, for rays that have a small likelihood * of following this particular path (diffuse, rough glossy) */ @@ -693,7 +703,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg, if(sd.flag & SD_EMISSION) { /* todo: is isect.t wrong here for transparent surfaces? */ float3 emission = indirect_primitive_emission(kg, &sd, isect.t, state.flag, state.ray_pdf); - path_radiance_accum_emission(&L, throughput, emission, state.bounce); + path_radiance_accum_emission(L, throughput, emission, state.bounce); } #endif /* __EMISSION__ */ @@ -713,10 +723,12 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg, throughput /= probability; } + kernel_update_denoising_features(kg, &sd, &state, L); + #ifdef __AO__ /* ambient occlusion */ if(kernel_data.integrator.use_ambient_occlusion || (sd.flag & SD_AO)) { - kernel_path_ao(kg, &sd, &emission_sd, &L, &state, rng, throughput, shader_bsdf_alpha(kg, &sd)); + kernel_path_ao(kg, &sd, &emission_sd, L, &state, rng, throughput, shader_bsdf_alpha(kg, &sd)); } #endif /* __AO__ */ @@ -727,7 +739,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg, if(kernel_path_subsurface_scatter(kg, &sd, &emission_sd, - &L, + L, &state, rng, &ray, @@ -740,15 +752,15 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg, #endif /* 
__SUBSURFACE__ */ /* direct lighting */ - kernel_path_surface_connect_light(kg, rng, &sd, &emission_sd, throughput, &state, &L); + kernel_path_surface_connect_light(kg, rng, &sd, &emission_sd, throughput, &state, L); /* compute direct lighting and next bounce */ - if(!kernel_path_surface_bounce(kg, rng, &sd, &throughput, &state, &L, &ray)) + if(!kernel_path_surface_bounce(kg, rng, &sd, &throughput, &state, L, &ray)) break; } #ifdef __SUBSURFACE__ - kernel_path_subsurface_accum_indirect(&ss_indirect, &L); + kernel_path_subsurface_accum_indirect(&ss_indirect, L); /* Trace indirect subsurface rays by restarting the loop. this uses less * stack memory than invoking kernel_path_indirect. @@ -758,7 +770,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg, &ss_indirect, &state, &ray, - &L, + L, &throughput); } else { @@ -767,24 +779,15 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg, } #endif /* __SUBSURFACE__ */ - float3 L_sum; #ifdef __SHADOW_TRICKS__ - if(state.flag & PATH_RAY_SHADOW_CATCHER) { - L_sum = path_radiance_sum_shadowcatcher(kg, &L, &L_transparent); - } - else + *is_shadow_catcher = (state.flag & PATH_RAY_SHADOW_CATCHER); #endif /* __SHADOW_TRICKS__ */ - { - L_sum = path_radiance_clamp_and_sum(kg, &L); - } - - kernel_write_light_passes(kg, buffer, &L, sample); #ifdef __KERNEL_DEBUG__ kernel_write_debug_passes(kg, buffer, &state, &debug_data, sample); #endif /* __KERNEL_DEBUG__ */ - return make_float4(L_sum.x, L_sum.y, L_sum.z, 1.0f - L_transparent); + return 1.0f - L_transparent; } ccl_device void kernel_path_trace(KernelGlobals *kg, @@ -805,18 +808,21 @@ ccl_device void kernel_path_trace(KernelGlobals *kg, kernel_path_trace_setup(kg, rng_state, sample, x, y, &rng, &ray); /* integrate */ - float4 L; - - if(ray.t != 0.0f) - L = kernel_path_integrate(kg, &rng, sample, ray, buffer); - else - L = make_float4(0.0f, 0.0f, 0.0f, 0.0f); + PathRadiance L; + bool is_shadow_catcher; - /* accumulate result in output buffer */ - 
kernel_write_pass_float4(buffer, sample, L); + if(ray.t != 0.0f) { + float alpha = kernel_path_integrate(kg, &rng, sample, ray, buffer, &L, &is_shadow_catcher); + kernel_write_result(kg, buffer, sample, &L, alpha, is_shadow_catcher); + } + else { + kernel_write_result(kg, buffer, sample, NULL, 0.0f, false); + } path_rng_end(kg, rng_state, rng); } +#endif /* __SPLIT_KERNEL__ */ + CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernel_path_branched.h b/intern/cycles/kernel/kernel_path_branched.h index 36fd6c95fe7..77d4f1df447 100644 --- a/intern/cycles/kernel/kernel_path_branched.h +++ b/intern/cycles/kernel/kernel_path_branched.h @@ -22,7 +22,7 @@ ccl_device_inline void kernel_branched_path_ao(KernelGlobals *kg, ShaderData *sd, ShaderData *emission_sd, PathRadiance *L, - PathState *state, + ccl_addr_space PathState *state, RNG *rng, float3 throughput) { @@ -56,29 +56,48 @@ ccl_device_inline void kernel_branched_path_ao(KernelGlobals *kg, light_ray.dD = differential3_zero(); if(!shadow_blocked(kg, emission_sd, state, &light_ray, &ao_shadow)) { - path_radiance_accum_ao(L, throughput*num_samples_inv, ao_alpha, ao_bsdf, ao_shadow, state->bounce); + path_radiance_accum_ao(L, state, throughput*num_samples_inv, ao_alpha, ao_bsdf, ao_shadow); } else { - path_radiance_accum_total_ao(L, throughput*num_samples_inv, ao_bsdf); + path_radiance_accum_total_ao(L, state, throughput*num_samples_inv, ao_bsdf); } } } } +#ifndef __SPLIT_KERNEL__ /* bounce off surface and integrate indirect light */ ccl_device_noinline void kernel_branched_path_surface_indirect_light(KernelGlobals *kg, RNG *rng, ShaderData *sd, ShaderData *indirect_sd, ShaderData *emission_sd, float3 throughput, float num_samples_adjust, PathState *state, PathRadiance *L) { + float sum_sample_weight = 0.0f; +#ifdef __DENOISING_FEATURES__ + if(state->denoising_feature_weight > 0.0f) { + for(int i = 0; i < sd->num_closure; i++) { + const ShaderClosure *sc = &sd->closure[i]; + + /* transparency is not handled here, but 
in outer loop */ + if(!CLOSURE_IS_BSDF(sc->type) || CLOSURE_IS_BSDF_TRANSPARENT(sc->type)) { + continue; + } + + sum_sample_weight += sc->sample_weight; + } + } + else { + sum_sample_weight = 1.0f; + } +#endif /* __DENOISING_FEATURES__ */ + for(int i = 0; i < sd->num_closure; i++) { const ShaderClosure *sc = &sd->closure[i]; - if(!CLOSURE_IS_BSDF(sc->type)) - continue; /* transparency is not handled here, but in outer loop */ - if(sc->type == CLOSURE_BSDF_TRANSPARENT_ID) + if(!CLOSURE_IS_BSDF(sc->type) || CLOSURE_IS_BSDF_TRANSPARENT(sc->type)) { continue; + } int num_samples; @@ -110,7 +129,8 @@ ccl_device_noinline void kernel_branched_path_surface_indirect_light(KernelGloba &tp, &ps, L, - &bsdf_ray)) + &bsdf_ray, + sum_sample_weight)) { continue; } @@ -242,14 +262,19 @@ ccl_device void kernel_branched_path_subsurface_scatter(KernelGlobals *kg, } #endif /* __SUBSURFACE__ */ -ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, int sample, Ray ray, ccl_global float *buffer) +ccl_device float kernel_branched_path_integrate(KernelGlobals *kg, + RNG *rng, + int sample, + Ray ray, + ccl_global float *buffer, + PathRadiance *L, + bool *is_shadow_catcher) { /* initialize */ - PathRadiance L; float3 throughput = make_float3(1.0f, 1.0f, 1.0f); float L_transparent = 0.0f; - path_radiance_init(&L, kernel_data.film.use_light_pass); + path_radiance_init(L, kernel_data.film.use_light_pass); /* shader data memory used for both volumes and surfaces, saves stack space */ ShaderData sd; @@ -329,7 +354,7 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in int all = kernel_data.integrator.sample_all_lights_direct; kernel_branched_path_volume_connect_light(kg, rng, &sd, - &emission_sd, throughput, &state, &L, all, + &emission_sd, throughput, &state, L, all, &volume_ray, &volume_segment); /* indirect light sampling */ @@ -337,11 +362,6 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in float 
num_samples_inv = 1.0f/num_samples; for(int j = 0; j < num_samples; j++) { - /* workaround to fix correlation bug in T38710, can find better solution - * in random number generator later, for now this is done here to not impact - * performance of rendering without volumes */ - RNG tmp_rng = cmj_hash(*rng, state.rng_offset); - PathState ps = state; Ray pray = ray; float3 tp = throughput; @@ -352,8 +372,8 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in /* scatter sample. if we use distance sampling and take just one * sample for direct and indirect light, we could share this * computation, but makes code a bit complex */ - float rphase = path_state_rng_1D_for_decision(kg, &tmp_rng, &ps, PRNG_PHASE); - float rscatter = path_state_rng_1D_for_decision(kg, &tmp_rng, &ps, PRNG_SCATTER_DISTANCE); + float rphase = path_state_rng_1D_for_decision(kg, rng, &ps, PRNG_PHASE); + float rscatter = path_state_rng_1D_for_decision(kg, rng, &ps, PRNG_SCATTER_DISTANCE); VolumeIntegrateResult result = kernel_volume_decoupled_scatter(kg, &ps, &pray, &sd, &tp, rphase, rscatter, &volume_segment, NULL, false); @@ -366,7 +386,7 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in &sd, &tp, &ps, - &L, + L, &pray)) { kernel_path_indirect(kg, @@ -377,19 +397,19 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in tp*num_samples_inv, num_samples, &ps, - &L); + L); /* for render passes, sum and reset indirect light pass variables * for the next samples */ - path_radiance_sum_indirect(&L); - path_radiance_reset_indirect(&L); + path_radiance_sum_indirect(L); + path_radiance_reset_indirect(L); } } } /* emission and transmittance */ if(volume_segment.closure_flag & SD_EMISSION) - path_radiance_accum_emission(&L, throughput, volume_segment.accum_emission, state.bounce); + path_radiance_accum_emission(L, throughput, volume_segment.accum_emission, state.bounce); throughput *= 
volume_segment.accum_transmittance; /* free cached steps */ @@ -411,20 +431,20 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in path_state_branch(&ps, j, num_samples); VolumeIntegrateResult result = kernel_volume_integrate( - kg, &ps, &sd, &volume_ray, &L, &tp, rng, heterogeneous); + kg, &ps, &sd, &volume_ray, L, &tp, rng, heterogeneous); #ifdef __VOLUME_SCATTER__ if(result == VOLUME_PATH_SCATTERED) { /* todo: support equiangular, MIS and all light sampling. * alternatively get decoupled ray marching working on the GPU */ - kernel_path_volume_connect_light(kg, rng, &sd, &emission_sd, tp, &state, &L); + kernel_path_volume_connect_light(kg, rng, &sd, &emission_sd, tp, &state, L); if(kernel_path_volume_bounce(kg, rng, &sd, &tp, &ps, - &L, + L, &pray)) { kernel_path_indirect(kg, @@ -435,12 +455,12 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in tp, num_samples, &ps, - &L); + L); /* for render passes, sum and reset indirect light pass variables * for the next samples */ - path_radiance_sum_indirect(&L); - path_radiance_reset_indirect(&L); + path_radiance_sum_indirect(L); + path_radiance_reset_indirect(L); } } #endif /* __VOLUME_SCATTER__ */ @@ -466,7 +486,7 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in #ifdef __BACKGROUND__ /* sample background shader */ float3 L_background = indirect_background(kg, &emission_sd, &state, &ray); - path_radiance_accum_background(&L, &state, throughput, L_background); + path_radiance_accum_background(L, &state, throughput, L_background); #endif /* __BACKGROUND__ */ break; @@ -479,13 +499,16 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in #ifdef __SHADOW_TRICKS__ if((sd.object_flag & SD_OBJECT_SHADOW_CATCHER)) { - if(state.flag & PATH_RAY_CAMERA) { - state.flag |= (PATH_RAY_SHADOW_CATCHER | PATH_RAY_SHADOW_CATCHER_ONLY); - state.catcher_object = sd.object; - if(!kernel_data.background.transparent) 
{ - L.shadow_color = indirect_background(kg, &emission_sd, &state, &ray); - } + state.flag |= (PATH_RAY_SHADOW_CATCHER | + PATH_RAY_SHADOW_CATCHER_ONLY | + PATH_RAY_STORE_SHADOW_INFO); + state.catcher_object = sd.object; + if(!kernel_data.background.transparent) { + L->shadow_background_color = + indirect_background(kg, &emission_sd, &state, &ray); } + L->shadow_radiance_sum = path_radiance_clamp_and_sum(kg, L); + L->shadow_throughput = average(throughput); } else { state.flag &= ~PATH_RAY_SHADOW_CATCHER_ONLY; @@ -513,13 +536,13 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in #endif /* __HOLDOUT__ */ /* holdout mask objects do not write data passes */ - kernel_write_data_passes(kg, buffer, &L, &sd, sample, &state, throughput); + kernel_write_data_passes(kg, buffer, L, &sd, sample, &state, throughput); #ifdef __EMISSION__ /* emission */ if(sd.flag & SD_EMISSION) { float3 emission = indirect_primitive_emission(kg, &sd, isect.t, state.flag, state.ray_pdf); - path_radiance_accum_emission(&L, throughput, emission, state.bounce); + path_radiance_accum_emission(L, throughput, emission, state.bounce); } #endif /* __EMISSION__ */ @@ -543,10 +566,12 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in } } + kernel_update_denoising_features(kg, &sd, &state, L); + #ifdef __AO__ /* ambient occlusion */ if(kernel_data.integrator.use_ambient_occlusion || (sd.flag & SD_AO)) { - kernel_branched_path_ao(kg, &sd, &emission_sd, &L, &state, rng, throughput); + kernel_branched_path_ao(kg, &sd, &emission_sd, L, &state, rng, throughput); } #endif /* __AO__ */ @@ -554,7 +579,7 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in /* bssrdf scatter to a different location on the same object */ if(sd.flag & SD_BSSRDF) { kernel_branched_path_subsurface_scatter(kg, &sd, &indirect_sd, &emission_sd, - &L, &state, rng, &ray, throughput); + L, &state, rng, &ray, throughput); } #endif /* __SUBSURFACE__ 
*/ @@ -567,13 +592,13 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in int all = (kernel_data.integrator.sample_all_lights_direct) || (state.flag & PATH_RAY_SHADOW_CATCHER); kernel_branched_path_surface_connect_light(kg, rng, - &sd, &emission_sd, &hit_state, throughput, 1.0f, &L, all); + &sd, &emission_sd, &hit_state, throughput, 1.0f, L, all); } #endif /* __EMISSION__ */ /* indirect light */ kernel_branched_path_surface_indirect_light(kg, rng, - &sd, &indirect_sd, &emission_sd, throughput, 1.0f, &hit_state, &L); + &sd, &indirect_sd, &emission_sd, throughput, 1.0f, &hit_state, L); /* continue in case of transparency */ throughput *= shader_bsdf_transparency(kg, &sd); @@ -602,24 +627,15 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in #endif /* __VOLUME__ */ } - float3 L_sum; #ifdef __SHADOW_TRICKS__ - if(state.flag & PATH_RAY_SHADOW_CATCHER) { - L_sum = path_radiance_sum_shadowcatcher(kg, &L, &L_transparent); - } - else + *is_shadow_catcher = (state.flag & PATH_RAY_SHADOW_CATCHER); #endif /* __SHADOW_TRICKS__ */ - { - L_sum = path_radiance_clamp_and_sum(kg, &L); - } - - kernel_write_light_passes(kg, buffer, &L, sample); #ifdef __KERNEL_DEBUG__ kernel_write_debug_passes(kg, buffer, &state, &debug_data, sample); #endif /* __KERNEL_DEBUG__ */ - return make_float4(L_sum.x, L_sum.y, L_sum.z, 1.0f - L_transparent); + return 1.0f - L_transparent; } ccl_device void kernel_branched_path_trace(KernelGlobals *kg, @@ -640,20 +656,22 @@ ccl_device void kernel_branched_path_trace(KernelGlobals *kg, kernel_path_trace_setup(kg, rng_state, sample, x, y, &rng, &ray); /* integrate */ - float4 L; - - if(ray.t != 0.0f) - L = kernel_branched_path_integrate(kg, &rng, sample, ray, buffer); - else - L = make_float4(0.0f, 0.0f, 0.0f, 0.0f); + PathRadiance L; + bool is_shadow_catcher; - /* accumulate result in output buffer */ - kernel_write_pass_float4(buffer, sample, L); + if(ray.t != 0.0f) { + float alpha = 
kernel_branched_path_integrate(kg, &rng, sample, ray, buffer, &L, &is_shadow_catcher); + kernel_write_result(kg, buffer, sample, &L, alpha, is_shadow_catcher); + } + else { + kernel_write_result(kg, buffer, sample, NULL, 0.0f, false); + } path_rng_end(kg, rng_state, rng); } +#endif /* __SPLIT_KERNEL__ */ + #endif /* __BRANCHED_PATH__ */ CCL_NAMESPACE_END - diff --git a/intern/cycles/kernel/kernel_path_state.h b/intern/cycles/kernel/kernel_path_state.h index c0cd2a63120..5d92fd12201 100644 --- a/intern/cycles/kernel/kernel_path_state.h +++ b/intern/cycles/kernel/kernel_path_state.h @@ -35,6 +35,16 @@ ccl_device_inline void path_state_init(KernelGlobals *kg, state->transmission_bounce = 0; state->transparent_bounce = 0; +#ifdef __DENOISING_FEATURES__ + if(kernel_data.film.pass_denoising_data) { + state->flag |= PATH_RAY_STORE_SHADOW_INFO; + state->denoising_feature_weight = 1.0f; + } + else { + state->denoising_feature_weight = 0.0f; + } +#endif /* __DENOISING_FEATURES__ */ + state->min_ray_pdf = FLT_MAX; state->ray_pdf = 0.0f; #ifdef __LAMP_MIS__ @@ -128,6 +138,12 @@ ccl_device_inline void path_state_next(KernelGlobals *kg, ccl_addr_space PathSta /* random number generator next bounce */ state->rng_offset += PRNG_BOUNCE_NUM; + +#ifdef __DENOISING_FEATURES__ + if((state->denoising_feature_weight == 0.0f) && !(state->flag & PATH_RAY_SHADOW_CATCHER)) { + state->flag &= ~PATH_RAY_STORE_SHADOW_INFO; + } +#endif } ccl_device_inline uint path_state_ray_visibility(KernelGlobals *kg, PathState *state) diff --git a/intern/cycles/kernel/kernel_path_surface.h b/intern/cycles/kernel/kernel_path_surface.h index 076c82f3853..dcb577e176f 100644 --- a/intern/cycles/kernel/kernel_path_surface.h +++ b/intern/cycles/kernel/kernel_path_surface.h @@ -16,7 +16,7 @@ CCL_NAMESPACE_BEGIN -#if defined(__BRANCHED_PATH__) || defined(__SUBSURFACE__) || defined(__SHADOW_TRICKS__) +#if defined(__BRANCHED_PATH__) || defined(__SUBSURFACE__) || defined(__SHADOW_TRICKS__) || defined(__BAKING__) /* 
branched path tracing: connect path directly to position on one or more lights and add it to L */ ccl_device_noinline void kernel_branched_path_surface_connect_light( KernelGlobals *kg, @@ -70,10 +70,10 @@ ccl_device_noinline void kernel_branched_path_surface_connect_light( if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) { /* accumulate */ - path_radiance_accum_light(L, throughput*num_samples_inv, &L_light, shadow, num_samples_inv, state->bounce, is_lamp); + path_radiance_accum_light(L, state, throughput*num_samples_inv, &L_light, shadow, num_samples_inv, is_lamp); } else { - path_radiance_accum_total_light(L, throughput*num_samples_inv, &L_light); + path_radiance_accum_total_light(L, state, throughput*num_samples_inv, &L_light); } } } @@ -107,10 +107,10 @@ ccl_device_noinline void kernel_branched_path_surface_connect_light( if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) { /* accumulate */ - path_radiance_accum_light(L, throughput*num_samples_inv, &L_light, shadow, num_samples_inv, state->bounce, is_lamp); + path_radiance_accum_light(L, state, throughput*num_samples_inv, &L_light, shadow, num_samples_inv, is_lamp); } else { - path_radiance_accum_total_light(L, throughput*num_samples_inv, &L_light); + path_radiance_accum_total_light(L, state, throughput*num_samples_inv, &L_light); } } } @@ -133,10 +133,10 @@ ccl_device_noinline void kernel_branched_path_surface_connect_light( if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) { /* accumulate */ - path_radiance_accum_light(L, throughput*num_samples_adjust, &L_light, shadow, num_samples_adjust, state->bounce, is_lamp); + path_radiance_accum_light(L, state, throughput*num_samples_adjust, &L_light, shadow, num_samples_adjust, is_lamp); } else { - path_radiance_accum_total_light(L, throughput*num_samples_adjust, &L_light); + path_radiance_accum_total_light(L, state, throughput*num_samples_adjust, &L_light); } } } @@ -155,7 +155,8 @@ ccl_device bool 
kernel_branched_path_surface_bounce( ccl_addr_space float3 *throughput, ccl_addr_space PathState *state, PathRadiance *L, - Ray *ray) + ccl_addr_space Ray *ray, + float sum_sample_weight) { /* sample BSDF */ float bsdf_pdf; @@ -175,6 +176,10 @@ ccl_device bool kernel_branched_path_surface_bounce( /* modify throughput */ path_radiance_bsdf_bounce(L, throughput, &bsdf_eval, bsdf_pdf, state->bounce, label); +#ifdef __DENOISING_FEATURES__ + state->denoising_feature_weight *= sc->sample_weight / (sum_sample_weight * num_samples); +#endif + /* modify path state */ path_state_next(kg, state, label); @@ -257,10 +262,10 @@ ccl_device_inline void kernel_path_surface_connect_light(KernelGlobals *kg, RNG if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) { /* accumulate */ - path_radiance_accum_light(L, throughput, &L_light, shadow, 1.0f, state->bounce, is_lamp); + path_radiance_accum_light(L, state, throughput, &L_light, shadow, 1.0f, is_lamp); } else { - path_radiance_accum_total_light(L, throughput, &L_light); + path_radiance_accum_total_light(L, state, throughput, &L_light); } } } diff --git a/intern/cycles/kernel/kernel_path_volume.h b/intern/cycles/kernel/kernel_path_volume.h index 371f2c1c7cb..dcedf51e479 100644 --- a/intern/cycles/kernel/kernel_path_volume.h +++ b/intern/cycles/kernel/kernel_path_volume.h @@ -55,7 +55,7 @@ ccl_device_inline void kernel_path_volume_connect_light( if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) { /* accumulate */ - path_radiance_accum_light(L, throughput, &L_light, shadow, 1.0f, state->bounce, is_lamp); + path_radiance_accum_light(L, state, throughput, &L_light, shadow, 1.0f, is_lamp); } } } @@ -184,7 +184,7 @@ ccl_device void kernel_branched_path_volume_connect_light( if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) { /* accumulate */ - path_radiance_accum_light(L, tp*num_samples_inv, &L_light, shadow, num_samples_inv, state->bounce, is_lamp); + path_radiance_accum_light(L, state, 
tp*num_samples_inv, &L_light, shadow, num_samples_inv, is_lamp); } } } @@ -233,7 +233,7 @@ ccl_device void kernel_branched_path_volume_connect_light( if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) { /* accumulate */ - path_radiance_accum_light(L, tp*num_samples_inv, &L_light, shadow, num_samples_inv, state->bounce, is_lamp); + path_radiance_accum_light(L, state, tp*num_samples_inv, &L_light, shadow, num_samples_inv, is_lamp); } } } @@ -271,7 +271,7 @@ ccl_device void kernel_branched_path_volume_connect_light( if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) { /* accumulate */ - path_radiance_accum_light(L, tp, &L_light, shadow, 1.0f, state->bounce, is_lamp); + path_radiance_accum_light(L, state, tp, &L_light, shadow, 1.0f, is_lamp); } } } diff --git a/intern/cycles/kernel/kernel_projection.h b/intern/cycles/kernel/kernel_projection.h index 9a2b0884a7e..cbb2442d1dc 100644 --- a/intern/cycles/kernel/kernel_projection.h +++ b/intern/cycles/kernel/kernel_projection.h @@ -57,6 +57,9 @@ ccl_device float3 spherical_to_direction(float theta, float phi) ccl_device float2 direction_to_equirectangular_range(float3 dir, float4 range) { + if(is_zero(dir)) + return make_float2(0.0f, 0.0f); + float u = (atan2f(dir.y, dir.x) - range.y) / range.x; float v = (acosf(dir.z / len(dir)) - range.w) / range.z; diff --git a/intern/cycles/kernel/kernel_queues.h b/intern/cycles/kernel/kernel_queues.h index 96bc636d5ac..e32d4bbbc1b 100644 --- a/intern/cycles/kernel/kernel_queues.h +++ b/intern/cycles/kernel/kernel_queues.h @@ -128,6 +128,21 @@ ccl_device unsigned int get_global_queue_index( return my_gqidx; } +ccl_device int dequeue_ray_index( + int queue_number, + ccl_global int *queues, + int queue_size, + ccl_global int *queue_index) +{ + int index = atomic_fetch_and_dec_uint32((ccl_global uint*)&queue_index[queue_number])-1; + + if(index < 0) { + return QUEUE_EMPTY_SLOT; + } + + return queues[index + queue_number * queue_size]; +} + CCL_NAMESPACE_END 
#endif // __KERNEL_QUEUE_H__ diff --git a/intern/cycles/kernel/kernel_random.h b/intern/cycles/kernel/kernel_random.h index d4f0caff5de..e8a912ccc0b 100644 --- a/intern/cycles/kernel/kernel_random.h +++ b/intern/cycles/kernel/kernel_random.h @@ -20,14 +20,15 @@ CCL_NAMESPACE_BEGIN #ifdef __SOBOL__ -/* skip initial numbers that are not as well distributed, especially the +/* Skip initial numbers that are not as well distributed, especially the * first sequence is just 0 everywhere, which can be problematic for e.g. - * path termination */ + * path termination. + */ #define SOBOL_SKIP 64 -/* High Dimensional Sobol */ +/* High Dimensional Sobol. */ -/* van der corput radical inverse */ +/* Van der Corput radical inverse. */ ccl_device uint van_der_corput(uint bits) { bits = (bits << 16) | (bits >> 16); @@ -38,58 +39,63 @@ ccl_device uint van_der_corput(uint bits) return bits; } -/* sobol radical inverse */ +/* Sobol radical inverse. */ ccl_device uint sobol(uint i) { uint r = 0; - - for(uint v = 1U << 31; i; i >>= 1, v ^= v >> 1) - if(i & 1) + for(uint v = 1U << 31; i; i >>= 1, v ^= v >> 1) { + if(i & 1) { r ^= v; - + } + } return r; } -/* inverse of sobol radical inverse */ +/* Inverse of sobol radical inverse. */ ccl_device uint sobol_inverse(uint i) { const uint msb = 1U << 31; uint r = 0; - - for(uint v = 1; i; i <<= 1, v ^= v << 1) - if(i & msb) + for(uint v = 1; i; i <<= 1, v ^= v << 1) { + if(i & msb) { r ^= v; - + } + } return r; } -/* multidimensional sobol with generator matrices - * dimension 0 and 1 are equal to van_der_corput() and sobol() respectively */ +/* Multidimensional sobol with generator matrices + * dimension 0 and 1 are equal to van_der_corput() and sobol() respectively. 
+ */ ccl_device uint sobol_dimension(KernelGlobals *kg, int index, int dimension) { uint result = 0; uint i = index; - - for(uint j = 0; i; i >>= 1, j++) - if(i & 1) + for(uint j = 0; i; i >>= 1, j++) { + if(i & 1) { result ^= kernel_tex_fetch(__sobol_directions, 32*dimension + j); - + } + } return result; } -/* lookup index and x/y coordinate, assumes m is a power of two */ -ccl_device uint sobol_lookup(const uint m, const uint frame, const uint ex, const uint ey, uint *x, uint *y) +/* Lookup index and x/y coordinate, assumes m is a power of two. */ +ccl_device uint sobol_lookup(const uint m, + const uint frame, + const uint ex, + const uint ey, + uint *x, uint *y) { - /* shift is constant per frame */ + /* Shift is constant per frame. */ const uint shift = frame << (m << 1); const uint sobol_shift = sobol(shift); - /* van der Corput is its own inverse */ + /* Van der Corput is its own inverse. */ const uint lower = van_der_corput(ex << (32 - m)); - /* need to compensate for ey difference and shift */ + /* Need to compensate for ey difference and shift. */ const uint sobol_lower = sobol(lower); - const uint mask = ~-(1 << m) << (32 - m); /* only m upper bits */ + const uint mask = ~-(1 << m) << (32 - m); /* Only m upper bits. */ const uint delta = ((ey << (32 - m)) ^ sobol_lower ^ sobol_shift) & mask; - /* only use m upper bits for the index (m is a power of two) */ + /* Only use m upper bits for the index (m is a power of two). 
*/ const uint sobol_result = delta | (delta >> m); const uint upper = sobol_inverse(sobol_result); const uint index = shift | upper | lower; @@ -98,11 +104,14 @@ ccl_device uint sobol_lookup(const uint m, const uint frame, const uint ex, cons return index; } -ccl_device_forceinline float path_rng_1D(KernelGlobals *kg, RNG *rng, int sample, int num_samples, int dimension) +ccl_device_forceinline float path_rng_1D(KernelGlobals *kg, + RNG *rng, + int sample, int num_samples, + int dimension) { #ifdef __CMJ__ if(kernel_data.integrator.sampling_pattern == SAMPLING_PATTERN_CMJ) { - /* correlated multi-jittered */ + /* Correlated multi-jitter. */ int p = *rng + dimension; return cmj_sample_1D(sample, num_samples, p); } @@ -113,7 +122,7 @@ ccl_device_forceinline float path_rng_1D(KernelGlobals *kg, RNG *rng, int sample float r = (float)result * (1.0f/(float)0xFFFFFFFF); return r; #else - /* compute sobol sequence value using direction vectors */ + /* Compute sobol sequence value using direction vectors. */ uint result = sobol_dimension(kg, sample + SOBOL_SKIP, dimension); float r = (float)result * (1.0f/(float)0xFFFFFFFF); @@ -130,24 +139,33 @@ ccl_device_forceinline float path_rng_1D(KernelGlobals *kg, RNG *rng, int sample #endif } -ccl_device_forceinline void path_rng_2D(KernelGlobals *kg, RNG *rng, int sample, int num_samples, int dimension, float *fx, float *fy) +ccl_device_forceinline void path_rng_2D(KernelGlobals *kg, + RNG *rng, + int sample, int num_samples, + int dimension, + float *fx, float *fy) { #ifdef __CMJ__ if(kernel_data.integrator.sampling_pattern == SAMPLING_PATTERN_CMJ) { - /* correlated multi-jittered */ + /* Correlated multi-jitter. */ int p = *rng + dimension; cmj_sample_2D(sample, num_samples, p, fx, fy); } else #endif { - /* sobol */ + /* Sobol. 
*/ *fx = path_rng_1D(kg, rng, sample, num_samples, dimension); *fy = path_rng_1D(kg, rng, sample, num_samples, dimension + 1); } } -ccl_device_inline void path_rng_init(KernelGlobals *kg, ccl_global uint *rng_state, int sample, int num_samples, RNG *rng, int x, int y, float *fx, float *fy) +ccl_device_inline void path_rng_init(KernelGlobals *kg, + ccl_global uint *rng_state, + int sample, int num_samples, + RNG *rng, + int x, int y, + float *fx, float *fy) { #ifdef __SOBOL_FULL_SCREEN__ uint px, py; @@ -182,29 +200,43 @@ ccl_device_inline void path_rng_init(KernelGlobals *kg, ccl_global uint *rng_sta #endif } -ccl_device void path_rng_end(KernelGlobals *kg, ccl_global uint *rng_state, RNG rng) +ccl_device void path_rng_end(KernelGlobals *kg, + ccl_global uint *rng_state, + RNG rng) { /* nothing to do */ } -#else +#else /* __SOBOL__ */ /* Linear Congruential Generator */ -ccl_device_forceinline float path_rng_1D(KernelGlobals *kg, RNG *rng, int sample, int num_samples, int dimension) +ccl_device_forceinline float path_rng_1D(KernelGlobals *kg, + RNG *rng, + int sample, int num_samples, + int dimension) { /* implicit mod 2^32 */ *rng = (1103515245*(*rng) + 12345); return (float)*rng * (1.0f/(float)0xFFFFFFFF); } -ccl_device_inline void path_rng_2D(KernelGlobals *kg, RNG *rng, int sample, int num_samples, int dimension, float *fx, float *fy) +ccl_device_inline void path_rng_2D(KernelGlobals *kg, + RNG *rng, + int sample, int num_samples, + int dimension, + float *fx, float *fy) { *fx = path_rng_1D(kg, rng, sample, num_samples, dimension); *fy = path_rng_1D(kg, rng, sample, num_samples, dimension + 1); } -ccl_device void path_rng_init(KernelGlobals *kg, ccl_global uint *rng_state, int sample, int num_samples, RNG *rng, int x, int y, float *fx, float *fy) +ccl_device void path_rng_init(KernelGlobals *kg, + ccl_global uint *rng_state, + int sample, int num_samples, + RNG *rng, + int x, int y, + float *fx, float *fy) { /* load state */ *rng = *rng_state; @@ -220,13 
+252,15 @@ ccl_device void path_rng_init(KernelGlobals *kg, ccl_global uint *rng_state, int } } -ccl_device void path_rng_end(KernelGlobals *kg, ccl_global uint *rng_state, RNG rng) +ccl_device void path_rng_end(KernelGlobals *kg, + ccl_global uint *rng_state, + RNG rng) { /* store state for next sample */ *rng_state = rng; } -#endif +#endif /* __SOBOL__ */ /* Linear Congruential Generator */ @@ -257,49 +291,108 @@ ccl_device uint lcg_init(uint seed) * dimension to avoid using the same sequence twice. * * For branches in the path we must be careful not to reuse the same number - * in a sequence and offset accordingly. */ + * in a sequence and offset accordingly. + */ -ccl_device_inline float path_state_rng_1D(KernelGlobals *kg, RNG *rng, const ccl_addr_space PathState *state, int dimension) +ccl_device_inline float path_state_rng_1D(KernelGlobals *kg, + RNG *rng, + const ccl_addr_space PathState *state, + int dimension) { - return path_rng_1D(kg, rng, state->sample, state->num_samples, state->rng_offset + dimension); + return path_rng_1D(kg, + rng, + state->sample, state->num_samples, + state->rng_offset + dimension); } -ccl_device_inline float path_state_rng_1D_for_decision(KernelGlobals *kg, RNG *rng, const ccl_addr_space PathState *state, int dimension) +ccl_device_inline float path_state_rng_1D_for_decision( + KernelGlobals *kg, + RNG *rng, + const ccl_addr_space PathState *state, + int dimension) { - /* the rng_offset is not increased for transparent bounces. if we do then + /* The rng_offset is not increased for transparent bounces. if we do then * fully transparent objects can become subtly visible by the different * sampling patterns used where the transparent object is. 
* * however for some random numbers that will determine if we next bounce * is transparent we do need to increase the offset to avoid always making - * the same decision */ - int rng_offset = state->rng_offset + state->transparent_bounce*PRNG_BOUNCE_NUM; - return path_rng_1D(kg, rng, state->sample, state->num_samples, rng_offset + dimension); + * the same decision. */ + const int rng_offset = state->rng_offset + state->transparent_bounce * PRNG_BOUNCE_NUM; + return path_rng_1D(kg, + rng, + state->sample, state->num_samples, + rng_offset + dimension); } -ccl_device_inline void path_state_rng_2D(KernelGlobals *kg, RNG *rng, const ccl_addr_space PathState *state, int dimension, float *fx, float *fy) +ccl_device_inline void path_state_rng_2D(KernelGlobals *kg, + RNG *rng, + const ccl_addr_space PathState *state, + int dimension, + float *fx, float *fy) { - path_rng_2D(kg, rng, state->sample, state->num_samples, state->rng_offset + dimension, fx, fy); + path_rng_2D(kg, + rng, + state->sample, state->num_samples, + state->rng_offset + dimension, + fx, fy); } -ccl_device_inline float path_branched_rng_1D(KernelGlobals *kg, RNG *rng, const ccl_addr_space PathState *state, int branch, int num_branches, int dimension) +ccl_device_inline float path_branched_rng_1D( + KernelGlobals *kg, + RNG *rng, + const ccl_addr_space PathState *state, + int branch, + int num_branches, + int dimension) { - return path_rng_1D(kg, rng, state->sample*num_branches + branch, state->num_samples*num_branches, state->rng_offset + dimension); + return path_rng_1D(kg, + rng, + state->sample * num_branches + branch, + state->num_samples * num_branches, + state->rng_offset + dimension); } -ccl_device_inline float path_branched_rng_1D_for_decision(KernelGlobals *kg, RNG *rng, const ccl_addr_space PathState *state, int branch, int num_branches, int dimension) +ccl_device_inline float path_branched_rng_1D_for_decision( + KernelGlobals *kg, + RNG *rng, + const ccl_addr_space PathState *state, + int branch, 
+ int num_branches, + int dimension) { - int rng_offset = state->rng_offset + state->transparent_bounce*PRNG_BOUNCE_NUM; - return path_rng_1D(kg, rng, state->sample*num_branches + branch, state->num_samples*num_branches, rng_offset + dimension); + const int rng_offset = state->rng_offset + state->transparent_bounce * PRNG_BOUNCE_NUM; + return path_rng_1D(kg, + rng, + state->sample * num_branches + branch, + state->num_samples * num_branches, + rng_offset + dimension); } -ccl_device_inline void path_branched_rng_2D(KernelGlobals *kg, RNG *rng, const ccl_addr_space PathState *state, int branch, int num_branches, int dimension, float *fx, float *fy) +ccl_device_inline void path_branched_rng_2D( + KernelGlobals *kg, + RNG *rng, + const ccl_addr_space PathState *state, + int branch, + int num_branches, + int dimension, + float *fx, float *fy) { - path_rng_2D(kg, rng, state->sample*num_branches + branch, state->num_samples*num_branches, state->rng_offset + dimension, fx, fy); + path_rng_2D(kg, + rng, + state->sample * num_branches + branch, + state->num_samples * num_branches, + state->rng_offset + dimension, + fx, fy); } -/* Utitility functions to get light termination value, since it might not be needed in many cases. */ -ccl_device_inline float path_state_rng_light_termination(KernelGlobals *kg, RNG *rng, const ccl_addr_space PathState *state) +/* Utitility functions to get light termination value, + * since it might not be needed in many cases. 
+ */ +ccl_device_inline float path_state_rng_light_termination( + KernelGlobals *kg, + RNG *rng, + const ccl_addr_space PathState *state) { if(kernel_data.integrator.light_inv_rr_threshold > 0.0f) { return path_state_rng_1D_for_decision(kg, rng, state, PRNG_LIGHT_TERMINATE); @@ -307,15 +400,27 @@ ccl_device_inline float path_state_rng_light_termination(KernelGlobals *kg, RNG return 0.0f; } -ccl_device_inline float path_branched_rng_light_termination(KernelGlobals *kg, RNG *rng, const ccl_addr_space PathState *state, int branch, int num_branches) +ccl_device_inline float path_branched_rng_light_termination( + KernelGlobals *kg, + RNG *rng, + const ccl_addr_space PathState *state, + int branch, + int num_branches) { if(kernel_data.integrator.light_inv_rr_threshold > 0.0f) { - return path_branched_rng_1D_for_decision(kg, rng, state, branch, num_branches, PRNG_LIGHT_TERMINATE); + return path_branched_rng_1D_for_decision(kg, + rng, + state, + branch, + num_branches, + PRNG_LIGHT_TERMINATE); } return 0.0f; } -ccl_device_inline void path_state_branch(ccl_addr_space PathState *state, int branch, int num_branches) +ccl_device_inline void path_state_branch(ccl_addr_space PathState *state, + int branch, + int num_branches) { /* path is splitting into a branch, adjust so that each branch * still gets a unique sample from the same sequence */ @@ -324,14 +429,17 @@ ccl_device_inline void path_state_branch(ccl_addr_space PathState *state, int br state->num_samples = state->num_samples*num_branches; } -ccl_device_inline uint lcg_state_init(RNG *rng, int rng_offset, int sample, uint scramble) +ccl_device_inline uint lcg_state_init(RNG *rng, + int rng_offset, + int sample, + uint scramble) { return lcg_init(*rng + rng_offset + sample*scramble); } ccl_device float lcg_step_float_addrspace(ccl_addr_space uint *rng) { - /* implicit mod 2^32 */ + /* Implicit mod 2^32 */ *rng = (1103515245*(*rng) + 12345); return (float)*rng * (1.0f/(float)0xFFFFFFFF); } diff --git 
a/intern/cycles/kernel/kernel_shader.h b/intern/cycles/kernel/kernel_shader.h index 8c0c5e90a3e..c66f52255f0 100644 --- a/intern/cycles/kernel/kernel_shader.h +++ b/intern/cycles/kernel/kernel_shader.h @@ -99,7 +99,7 @@ ccl_device_noinline void shader_setup_from_ray(KernelGlobals *kg, /* smooth normal */ if(sd->shader & SHADER_SMOOTH_NORMAL) - sd->N = triangle_smooth_normal(kg, sd->prim, sd->u, sd->v); + sd->N = triangle_smooth_normal(kg, Ng, sd->prim, sd->u, sd->v); #ifdef __DPDU__ /* dPdu/dPdv */ @@ -186,7 +186,7 @@ void shader_setup_from_subsurface( sd->N = Ng; if(sd->shader & SHADER_SMOOTH_NORMAL) - sd->N = triangle_smooth_normal(kg, sd->prim, sd->u, sd->v); + sd->N = triangle_smooth_normal(kg, Ng, sd->prim, sd->u, sd->v); # ifdef __DPDU__ /* dPdu/dPdv */ @@ -300,7 +300,7 @@ ccl_device_inline void shader_setup_from_sample(KernelGlobals *kg, if(sd->type & PRIMITIVE_TRIANGLE) { /* smooth normal */ if(sd->shader & SHADER_SMOOTH_NORMAL) { - sd->N = triangle_smooth_normal(kg, sd->prim, sd->u, sd->v); + sd->N = triangle_smooth_normal(kg, Ng, sd->prim, sd->u, sd->v); #ifdef __INSTANCING__ if(!(sd->object_flag & SD_OBJECT_TRANSFORM_APPLIED)) { diff --git a/intern/cycles/kernel/kernel_shadow.h b/intern/cycles/kernel/kernel_shadow.h index db6f839d9ed..fab5946970d 100644 --- a/intern/cycles/kernel/kernel_shadow.h +++ b/intern/cycles/kernel/kernel_shadow.h @@ -379,7 +379,7 @@ ccl_device bool shadow_blocked_transparent_stepped( float3 *shadow) { bool blocked, is_transparent_isect; - if (skip_object == OBJECT_NONE) { + if(skip_object == OBJECT_NONE) { blocked = scene_intersect(kg, *ray, PATH_RAY_SHADOW_OPAQUE, diff --git a/intern/cycles/kernel/kernel_subsurface.h b/intern/cycles/kernel/kernel_subsurface.h index f75e9337bdb..6475d4b66fd 100644 --- a/intern/cycles/kernel/kernel_subsurface.h +++ b/intern/cycles/kernel/kernel_subsurface.h @@ -140,7 +140,7 @@ ccl_device_inline float3 subsurface_scatter_eval(ShaderData *sd, } /* replace closures with a single diffuse bsdf closure 
after scatter step */ -ccl_device void subsurface_scatter_setup_diffuse_bsdf(ShaderData *sd, float3 weight, bool hit, float3 N) +ccl_device void subsurface_scatter_setup_diffuse_bsdf(ShaderData *sd, ShaderClosure *sc, float3 weight, bool hit, float3 N) { sd->flag &= ~SD_CLOSURE_FLAGS; sd->randb_closure = 0.0f; @@ -148,15 +148,35 @@ ccl_device void subsurface_scatter_setup_diffuse_bsdf(ShaderData *sd, float3 wei sd->num_closure_extra = 0; if(hit) { - DiffuseBsdf *bsdf = (DiffuseBsdf*)bsdf_alloc(sd, sizeof(DiffuseBsdf), weight); - - if(bsdf) { - bsdf->N = N; - sd->flag |= bsdf_diffuse_setup(bsdf); - - /* replace CLOSURE_BSDF_DIFFUSE_ID with this special ID so render passes - * can recognize it as not being a regular diffuse closure */ - bsdf->type = CLOSURE_BSDF_BSSRDF_ID; + Bssrdf *bssrdf = (Bssrdf *)sc; +#ifdef __PRINCIPLED__ + if(bssrdf->type == CLOSURE_BSSRDF_PRINCIPLED_ID) { + PrincipledDiffuseBsdf *bsdf = (PrincipledDiffuseBsdf*)bsdf_alloc(sd, sizeof(PrincipledDiffuseBsdf), weight); + + if(bsdf) { + bsdf->N = N; + bsdf->roughness = bssrdf->roughness; + sd->flag |= bsdf_principled_diffuse_setup(bsdf); + + /* replace CLOSURE_BSDF_PRINCIPLED_DIFFUSE_ID with this special ID so render passes + * can recognize it as not being a regular Disney principled diffuse closure */ + bsdf->type = CLOSURE_BSDF_BSSRDF_PRINCIPLED_ID; + } + } + else if(CLOSURE_IS_BSDF_BSSRDF(bssrdf->type) || + CLOSURE_IS_BSSRDF(bssrdf->type)) +#endif /* __PRINCIPLED__ */ + { + DiffuseBsdf *bsdf = (DiffuseBsdf*)bsdf_alloc(sd, sizeof(DiffuseBsdf), weight); + + if(bsdf) { + bsdf->N = N; + sd->flag |= bsdf_diffuse_setup(bsdf); + + /* replace CLOSURE_BSDF_DIFFUSE_ID with this special ID so render passes + * can recognize it as not being a regular diffuse closure */ + bsdf->type = CLOSURE_BSDF_BSSRDF_ID; + } } } } @@ -379,6 +399,12 @@ ccl_device_noinline void subsurface_scatter_multi_setup( #else Ray *ray = &ss_isect->ray; #endif + + /* Workaround for AMD GPU OpenCL compiler. 
Most probably cache bypass issue. */ +#if defined(__SPLIT_KERNEL__) && defined(__KERNEL_OPENCL_AMD__) && defined(__KERNEL_GPU__) + kernel_split_params.dummy_sd_flag = sd->flag; +#endif + /* Setup new shading point. */ shader_setup_from_subsurface(kg, sd, &ss_isect->hits[hit], ray); @@ -388,12 +414,11 @@ ccl_device_noinline void subsurface_scatter_multi_setup( subsurface_color_bump_blur(kg, sd, state, state_flag, &weight, &N); /* Setup diffuse BSDF. */ - subsurface_scatter_setup_diffuse_bsdf(sd, weight, true, N); + subsurface_scatter_setup_diffuse_bsdf(sd, sc, weight, true, N); } -#ifndef __SPLIT_KERNEL__ /* subsurface scattering step, from a point on the surface to another nearby point on the same object */ -ccl_device void subsurface_scatter_step(KernelGlobals *kg, ShaderData *sd, PathState *state, +ccl_device void subsurface_scatter_step(KernelGlobals *kg, ShaderData *sd, ccl_addr_space PathState *state, int state_flag, ShaderClosure *sc, uint *lcg_state, float disk_u, float disk_v, bool all) { float3 eval = make_float3(0.0f, 0.0f, 0.0f); @@ -454,6 +479,10 @@ ccl_device void subsurface_scatter_step(KernelGlobals *kg, ShaderData *sd, PathS if(ss_isect.num_hits > 0) { float3 origP = sd->P; + /* Workaround for AMD GPU OpenCL compiler. Most probably cache bypass issue. */ +#if defined(__SPLIT_KERNEL__) && defined(__KERNEL_OPENCL_AMD__) && defined(__KERNEL_GPU__) + kernel_split_params.dummy_sd_flag = sd->flag; +#endif /* setup new shading point */ shader_setup_from_subsurface(kg, sd, &ss_isect.hits[0], &ray); @@ -479,9 +508,8 @@ ccl_device void subsurface_scatter_step(KernelGlobals *kg, ShaderData *sd, PathS subsurface_color_bump_blur(kg, sd, state, state_flag, &eval, &N); /* setup diffuse bsdf */ - subsurface_scatter_setup_diffuse_bsdf(sd, eval, (ss_isect.num_hits > 0), N); + subsurface_scatter_setup_diffuse_bsdf(sd, sc, eval, (ss_isect.num_hits > 0), N); } -#endif /* ! 
__SPLIT_KERNEL__ */ CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernel_textures.h b/intern/cycles/kernel/kernel_textures.h index cb1a3f40dee..aa5b32803a5 100644 --- a/intern/cycles/kernel/kernel_textures.h +++ b/intern/cycles/kernel/kernel_textures.h @@ -82,10 +82,10 @@ KERNEL_TEX(uint, texture_uint, __sobol_directions) # if __CUDA_ARCH__ < 300 /* full-float image */ KERNEL_IMAGE_TEX(float4, texture_image_float4, __tex_image_float4_000) -KERNEL_IMAGE_TEX(float4, texture_image_float4, __tex_image_float4_001) -KERNEL_IMAGE_TEX(float4, texture_image_float4, __tex_image_float4_002) -KERNEL_IMAGE_TEX(float4, texture_image_float4, __tex_image_float4_003) -KERNEL_IMAGE_TEX(float4, texture_image_float4, __tex_image_float4_004) +KERNEL_IMAGE_TEX(float4, texture_image_float4, __tex_image_float4_008) +KERNEL_IMAGE_TEX(float4, texture_image_float4, __tex_image_float4_016) +KERNEL_IMAGE_TEX(float4, texture_image_float4, __tex_image_float4_024) +KERNEL_IMAGE_TEX(float4, texture_image_float4, __tex_image_float4_032) KERNEL_IMAGE_TEX(float4, texture_image3d_float4, __tex_image_float4_3d_000) KERNEL_IMAGE_TEX(float4, texture_image3d_float4, __tex_image_float4_3d_001) @@ -93,91 +93,93 @@ KERNEL_IMAGE_TEX(float4, texture_image3d_float4, __tex_image_float4_3d_002) KERNEL_IMAGE_TEX(float4, texture_image3d_float4, __tex_image_float4_3d_003) KERNEL_IMAGE_TEX(float4, texture_image3d_float4, __tex_image_float4_3d_004) -/* image */ -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_005) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_006) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_007) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_008) +/* image + * These texture names are encoded to their flattened slots as + * ImageManager::type_index_to_flattened_slot() returns them. 
*/ +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_001) KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_009) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_010) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_011) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_012) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_013) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_014) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_015) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_016) KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_017) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_018) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_019) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_020) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_021) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_022) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_023) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_024) KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_025) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_026) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_027) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_028) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_029) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_030) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_031) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_032) KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_033) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_034) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_035) 
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_036) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_037) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_038) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_039) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_040) KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_041) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_042) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_043) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_044) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_045) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_046) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_047) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_048) KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_049) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_050) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_051) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_052) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_053) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_054) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_055) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_056) KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_057) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_058) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_059) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_060) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_061) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_062) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_063) 
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_064) KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_065) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_066) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_067) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_068) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_069) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_070) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_071) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_072) KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_073) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_074) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_075) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_076) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_077) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_078) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_079) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_080) KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_081) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_082) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_083) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_084) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_085) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_086) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_087) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_088) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_089) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_097) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_105) 
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_113) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_121) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_129) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_137) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_145) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_153) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_161) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_169) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_177) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_185) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_193) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_201) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_209) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_217) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_225) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_233) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_241) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_249) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_257) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_265) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_273) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_281) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_289) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_297) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_305) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_313) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_321) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_329) 
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_337) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_345) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_353) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_361) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_369) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_377) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_385) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_393) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_401) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_409) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_417) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_425) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_433) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_441) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_449) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_457) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_465) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_473) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_481) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_489) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_497) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_505) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_513) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_521) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_529) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_537) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_545) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_553) 
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_561) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_569) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_577) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_585) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_593) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_601) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_609) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_617) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_625) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_633) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_641) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_649) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_657) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_665) # else /* bindless textures */ diff --git a/intern/cycles/kernel/kernel_types.h b/intern/cycles/kernel/kernel_types.h index 623f3728c69..e6a62c42a38 100644 --- a/intern/cycles/kernel/kernel_types.h +++ b/intern/cycles/kernel/kernel_types.h @@ -64,6 +64,18 @@ CCL_NAMESPACE_BEGIN # define WORK_POOL_SIZE WORK_POOL_SIZE_CPU #endif + +#define SHADER_SORT_BLOCK_SIZE 2048 + +#ifdef __KERNEL_OPENCL__ +# define SHADER_SORT_LOCAL_SIZE 64 +#elif defined(__KERNEL_CUDA__) +# define SHADER_SORT_LOCAL_SIZE 32 +#else +# define SHADER_SORT_LOCAL_SIZE 1 +#endif + + /* device capabilities */ #ifdef __KERNEL_CPU__ # ifdef __KERNEL_SSE2__ @@ -71,21 +83,18 @@ CCL_NAMESPACE_BEGIN # endif # define __KERNEL_SHADING__ # define __KERNEL_ADV_SHADING__ -# ifndef __SPLIT_KERNEL__ -# define __BRANCHED_PATH__ -# endif +# define __BRANCHED_PATH__ # ifdef WITH_OSL # define __OSL__ # endif +# define __PRINCIPLED__ # define __SUBSURFACE__ # define __CMJ__ # define __VOLUME__ # define __VOLUME_SCATTER__ # define 
__SHADOW_RECORD_ALL__ -# ifndef __SPLIT_KERNEL__ -# define __VOLUME_DECOUPLED__ -# define __VOLUME_RECORD_ALL__ -# endif +# define __VOLUME_DECOUPLED__ +# define __VOLUME_RECORD_ALL__ #endif /* __KERNEL_CPU__ */ #ifdef __KERNEL_CUDA__ @@ -94,10 +103,11 @@ CCL_NAMESPACE_BEGIN # define __VOLUME__ # define __VOLUME_SCATTER__ # define __SUBSURFACE__ +# define __PRINCIPLED__ # define __SHADOW_RECORD_ALL__ +# define __CMJ__ # ifndef __SPLIT_KERNEL__ # define __BRANCHED_PATH__ -# define __CMJ__ # endif #endif /* __KERNEL_CUDA__ */ @@ -109,43 +119,44 @@ CCL_NAMESPACE_BEGIN # define __KERNEL_SHADING__ # define __KERNEL_ADV_SHADING__ # define __SUBSURFACE__ +# define __PRINCIPLED__ # define __VOLUME__ # define __VOLUME_SCATTER__ # define __SHADOW_RECORD_ALL__ -# ifdef __KERNEL_EXPERIMENTAL__ -# define __CMJ__ -# endif +# define __CMJ__ +# define __BRANCHED_PATH__ # endif /* __KERNEL_OPENCL_NVIDIA__ */ # ifdef __KERNEL_OPENCL_APPLE__ # define __KERNEL_SHADING__ # define __KERNEL_ADV_SHADING__ +# define __PRINCIPLED__ +# define __CMJ__ /* TODO(sergey): Currently experimental section is ignored here, * this is because megakernel in device_opencl does not support * custom cflags depending on the scene features. 
*/ -# ifdef __KERNEL_EXPERIMENTAL__ -# define __CMJ__ -# endif -# endif /* __KERNEL_OPENCL_NVIDIA__ */ +# endif /* __KERNEL_OPENCL_APPLE__ */ # ifdef __KERNEL_OPENCL_AMD__ # define __CL_USE_NATIVE__ # define __KERNEL_SHADING__ # define __KERNEL_ADV_SHADING__ # define __SUBSURFACE__ +# define __PRINCIPLED__ # define __VOLUME__ # define __VOLUME_SCATTER__ # define __SHADOW_RECORD_ALL__ +# define __CMJ__ +# define __BRANCHED_PATH__ # endif /* __KERNEL_OPENCL_AMD__ */ # ifdef __KERNEL_OPENCL_INTEL_CPU__ # define __CL_USE_NATIVE__ # define __KERNEL_SHADING__ # define __KERNEL_ADV_SHADING__ -# ifdef __KERNEL_EXPERIMENTAL__ -# define __CMJ__ -# endif +# define __PRINCIPLED__ +# define __CMJ__ # endif /* __KERNEL_OPENCL_INTEL_CPU__ */ #endif /* __KERNEL_OPENCL__ */ @@ -165,6 +176,8 @@ CCL_NAMESPACE_BEGIN #define __PATCH_EVAL__ #define __SHADOW_TRICKS__ +#define __DENOISING_FEATURES__ + #ifdef __KERNEL_SHADING__ # define __SVM__ # define __EMISSION__ @@ -220,7 +233,13 @@ CCL_NAMESPACE_BEGIN # undef __TRANSPARENT_SHADOWS__ #endif #ifdef __NO_SHADOW_TRICKS__ -#undef __SHADOW_TRICKS__ +# undef __SHADOW_TRICKS__ +#endif +#ifdef __NO_PRINCIPLED__ +# undef __PRINCIPLED__ +#endif +#ifdef __NO_DENOISING__ +# undef __DENOISING_FEATURES__ #endif /* Random Numbers */ @@ -303,31 +322,32 @@ enum SamplingPattern { /* these flags values correspond to raytypes in osl.cpp, so keep them in sync! 
*/ enum PathRayFlag { - PATH_RAY_CAMERA = 1, - PATH_RAY_REFLECT = 2, - PATH_RAY_TRANSMIT = 4, - PATH_RAY_DIFFUSE = 8, - PATH_RAY_GLOSSY = 16, - PATH_RAY_SINGULAR = 32, - PATH_RAY_TRANSPARENT = 64, - - PATH_RAY_SHADOW_OPAQUE = 128, - PATH_RAY_SHADOW_TRANSPARENT = 256, + PATH_RAY_CAMERA = (1 << 0), + PATH_RAY_REFLECT = (1 << 1), + PATH_RAY_TRANSMIT = (1 << 2), + PATH_RAY_DIFFUSE = (1 << 3), + PATH_RAY_GLOSSY = (1 << 4), + PATH_RAY_SINGULAR = (1 << 5), + PATH_RAY_TRANSPARENT = (1 << 6), + + PATH_RAY_SHADOW_OPAQUE = (1 << 7), + PATH_RAY_SHADOW_TRANSPARENT = (1 << 8), PATH_RAY_SHADOW = (PATH_RAY_SHADOW_OPAQUE|PATH_RAY_SHADOW_TRANSPARENT), - PATH_RAY_CURVE = 512, /* visibility flag to define curve segments */ - PATH_RAY_VOLUME_SCATTER = 1024, /* volume scattering */ + PATH_RAY_CURVE = (1 << 9), /* visibility flag to define curve segments */ + PATH_RAY_VOLUME_SCATTER = (1 << 10), /* volume scattering */ /* Special flag to tag unaligned BVH nodes. */ - PATH_RAY_NODE_UNALIGNED = 2048, + PATH_RAY_NODE_UNALIGNED = (1 << 11), - PATH_RAY_ALL_VISIBILITY = (1|2|4|8|16|32|64|128|256|512|1024|2048), + PATH_RAY_ALL_VISIBILITY = ((1 << 12)-1), - PATH_RAY_MIS_SKIP = 4096, - PATH_RAY_DIFFUSE_ANCESTOR = 8192, - PATH_RAY_SINGLE_PASS_DONE = 16384, - PATH_RAY_SHADOW_CATCHER = 32768, - PATH_RAY_SHADOW_CATCHER_ONLY = 65536, + PATH_RAY_MIS_SKIP = (1 << 12), + PATH_RAY_DIFFUSE_ANCESTOR = (1 << 13), + PATH_RAY_SINGLE_PASS_DONE = (1 << 14), + PATH_RAY_SHADOW_CATCHER = (1 << 15), + PATH_RAY_SHADOW_CATCHER_ONLY = (1 << 16), + PATH_RAY_STORE_SHADOW_INFO = (1 << 17), }; /* Closure Label */ @@ -383,6 +403,22 @@ typedef enum PassType { #define PASS_ALL (~0) +typedef enum DenoisingPassOffsets { + DENOISING_PASS_NORMAL = 0, + DENOISING_PASS_NORMAL_VAR = 3, + DENOISING_PASS_ALBEDO = 6, + DENOISING_PASS_ALBEDO_VAR = 9, + DENOISING_PASS_DEPTH = 12, + DENOISING_PASS_DEPTH_VAR = 13, + DENOISING_PASS_SHADOW_A = 14, + DENOISING_PASS_SHADOW_B = 17, + DENOISING_PASS_COLOR = 20, + DENOISING_PASS_COLOR_VAR = 23, + 
+ DENOISING_PASS_SIZE_BASE = 26, + DENOISING_PASS_SIZE_CLEAN = 3, +} DenoisingPassOffsets; + typedef enum BakePassFilter { BAKE_FILTER_NONE = 0, BAKE_FILTER_DIRECT = (1 << 0), @@ -416,6 +452,18 @@ typedef enum BakePassFilterCombos { BAKE_FILTER_SUBSURFACE_INDIRECT = (BAKE_FILTER_INDIRECT | BAKE_FILTER_SUBSURFACE), } BakePassFilterCombos; +typedef enum DenoiseFlag { + DENOISING_CLEAN_DIFFUSE_DIR = (1 << 0), + DENOISING_CLEAN_DIFFUSE_IND = (1 << 1), + DENOISING_CLEAN_GLOSSY_DIR = (1 << 2), + DENOISING_CLEAN_GLOSSY_IND = (1 << 3), + DENOISING_CLEAN_TRANSMISSION_DIR = (1 << 4), + DENOISING_CLEAN_TRANSMISSION_IND = (1 << 5), + DENOISING_CLEAN_SUBSURFACE_DIR = (1 << 6), + DENOISING_CLEAN_SUBSURFACE_IND = (1 << 7), + DENOISING_CLEAN_ALL_PASSES = (1 << 8)-1, +} DenoiseFlag; + typedef ccl_addr_space struct PathRadiance { #ifdef __PASSES__ int use_light_pass; @@ -469,8 +517,20 @@ typedef ccl_addr_space struct PathRadiance { float3 path_total_shaded; /* Color of the background on which shadow is alpha-overed. */ - float3 shadow_color; + float3 shadow_background_color; + + /* Path radiance sum and throughput at the moment when ray hits shadow + * catcher object. 
+ */ + float3 shadow_radiance_sum; + float shadow_throughput; #endif + +#ifdef __DENOISING_FEATURES__ + float3 denoising_normal; + float3 denoising_albedo; + float denoising_depth; +#endif /* __DENOISING_FEATURES__ */ } PathRadiance; typedef struct BsdfEval { @@ -713,12 +773,13 @@ typedef struct AttributeDescriptor { #define SHADER_CLOSURE_BASE \ float3 weight; \ ClosureType type; \ - float sample_weight \ + float sample_weight; \ + float3 N typedef ccl_addr_space struct ccl_align(16) ShaderClosure { SHADER_CLOSURE_BASE; - float data[14]; /* pad to 80 bytes */ + float data[10]; /* pad to 80 bytes */ } ShaderClosure; /* Shader Context @@ -949,6 +1010,10 @@ typedef struct PathState { int transmission_bounce; int transparent_bounce; +#ifdef __DENOISING_FEATURES__ + float denoising_feature_weight; +#endif /* __DENOISING_FEATURES__ */ + /* multiple importance sampling */ float min_ray_pdf; /* smallest bounce pdf over entire path up to now */ float ray_pdf; /* last bounce pdf */ @@ -1126,6 +1191,11 @@ typedef struct KernelFilm { float mist_inv_depth; float mist_falloff; + int pass_denoising_data; + int pass_denoising_clean; + int denoising_flags; + int pad; + #ifdef __KERNEL_DEBUG__ int pass_bvh_traversed_nodes; int pass_bvh_traversed_instances; @@ -1298,7 +1368,6 @@ typedef ccl_addr_space struct DebugData { * Queue 3 - Shadow ray cast kernel - AO * Queeu 4 - Shadow ray cast kernel - direct lighting */ -#define NUM_QUEUES 4 /* Queue names */ enum QueueNumber { @@ -1311,22 +1380,42 @@ enum QueueNumber { * 3. Rays to be regenerated * are enqueued here. */ - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS = 1, + QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS, /* All rays for which a shadow ray should be cast to determine radiance * contribution for AO are enqueued here. */ - QUEUE_SHADOW_RAY_CAST_AO_RAYS = 2, + QUEUE_SHADOW_RAY_CAST_AO_RAYS, /* All rays for which a shadow ray should be cast to determine radiance * contributing for direct lighting are enqueued here. 
*/ - QUEUE_SHADOW_RAY_CAST_DL_RAYS = 3, + QUEUE_SHADOW_RAY_CAST_DL_RAYS, + + /* Rays sorted according to shader->id */ + QUEUE_SHADER_SORTED_RAYS, + +#ifdef __BRANCHED_PATH__ + /* All rays moving to next iteration of the indirect loop for light */ + QUEUE_LIGHT_INDIRECT_ITER, + /* Queue of all inactive rays. These are candidates for sharing work of indirect loops */ + QUEUE_INACTIVE_RAYS, +# ifdef __VOLUME__ + /* All rays moving to next iteration of the indirect loop for volumes */ + QUEUE_VOLUME_INDIRECT_ITER, +# endif +# ifdef __SUBSURFACE__ + /* All rays moving to next iteration of the indirect loop for subsurface */ + QUEUE_SUBSURFACE_INDIRECT_ITER, +# endif +#endif /* __BRANCHED_PATH__ */ + + NUM_QUEUES }; -/* We use RAY_STATE_MASK to get ray_state (enums 0 to 5) */ -#define RAY_STATE_MASK 0x007 -#define RAY_FLAG_MASK 0x0F8 +/* We use RAY_STATE_MASK to get ray_state */ +#define RAY_STATE_MASK 0x0F +#define RAY_FLAG_MASK 0xF0 enum RayState { RAY_INVALID = 0, /* Denotes ray is actively involved in path-iteration. */ @@ -1341,14 +1430,25 @@ enum RayState { RAY_TO_REGENERATE, /* Denotes ray has been regenerated */ RAY_REGENERATED, - /* Flag's ray has to execute shadow blocked function in AO part */ - RAY_SHADOW_RAY_CAST_AO = 16, - /* Flag's ray has to execute shadow blocked function in direct lighting part. 
*/ - RAY_SHADOW_RAY_CAST_DL = 32, + /* Denotes ray is moving to next iteration of the branched indirect loop */ + RAY_LIGHT_INDIRECT_NEXT_ITER, + RAY_VOLUME_INDIRECT_NEXT_ITER, + RAY_SUBSURFACE_INDIRECT_NEXT_ITER, + + /* Ray flags */ + + /* Flags to denote that the ray is currently evaluating the branched indirect loop */ + RAY_BRANCHED_LIGHT_INDIRECT = (1 << 4), + RAY_BRANCHED_VOLUME_INDIRECT = (1 << 5), + RAY_BRANCHED_SUBSURFACE_INDIRECT = (1 << 6), + RAY_BRANCHED_INDIRECT = (RAY_BRANCHED_LIGHT_INDIRECT | RAY_BRANCHED_VOLUME_INDIRECT | RAY_BRANCHED_SUBSURFACE_INDIRECT), + + /* Ray is evaluating an iteration of an indirect loop for another thread */ + RAY_BRANCHED_INDIRECT_SHARED = (1 << 7), }; #define ASSIGN_RAY_STATE(ray_state, ray_index, state) (ray_state[ray_index] = ((ray_state[ray_index] & RAY_FLAG_MASK) | state)) -#define IS_STATE(ray_state, ray_index, state) ((ray_state[ray_index] & RAY_STATE_MASK) == state) +#define IS_STATE(ray_state, ray_index, state) ((ray_index) != QUEUE_EMPTY_SLOT && ((ray_state)[(ray_index)] & RAY_STATE_MASK) == (state)) #define ADD_RAY_FLAG(ray_state, ray_index, flag) (ray_state[ray_index] = (ray_state[ray_index] | flag)) #define REMOVE_RAY_FLAG(ray_state, ray_index, flag) (ray_state[ray_index] = (ray_state[ray_index] & (~flag))) #define IS_FLAG(ray_state, ray_index, flag) (ray_state[ray_index] & flag) diff --git a/intern/cycles/kernel/kernel_volume.h b/intern/cycles/kernel/kernel_volume.h index 9c0878249d4..1e472aaf51a 100644 --- a/intern/cycles/kernel/kernel_volume.h +++ b/intern/cycles/kernel/kernel_volume.h @@ -660,6 +660,7 @@ typedef struct VolumeSegment { * but the entire segment is needed to do always scattering, rather than probabilistically * hitting or missing the volume. 
if we don't know the transmittance at the end of the * volume we can't generate stratified distance samples up to that transmittance */ +#ifdef __VOLUME_DECOUPLED__ ccl_device void kernel_volume_decoupled_record(KernelGlobals *kg, PathState *state, Ray *ray, ShaderData *sd, VolumeSegment *segment, bool heterogeneous) { @@ -829,6 +830,7 @@ ccl_device void kernel_volume_decoupled_free(KernelGlobals *kg, VolumeSegment *s #endif } } +#endif /* __VOLUME_DECOUPLED__ */ /* scattering for homogeneous and heterogeneous volumes, using decoupled ray * marching. diff --git a/intern/cycles/kernel/kernels/cpu/filter.cpp b/intern/cycles/kernel/kernels/cpu/filter.cpp new file mode 100644 index 00000000000..2ff1a392dc3 --- /dev/null +++ b/intern/cycles/kernel/kernels/cpu/filter.cpp @@ -0,0 +1,61 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* CPU kernel entry points */ + +/* On x86-64, we can assume SSE2, so avoid the extra kernel and compile this + * one with SSE2 intrinsics. + */ +#if defined(__x86_64__) || defined(_M_X64) +# define __KERNEL_SSE2__ +#endif + +/* When building kernel for native machine detect kernel features from the flags + * set by compiler. 
+ */ +#ifdef WITH_KERNEL_NATIVE +# ifdef __SSE2__ +# ifndef __KERNEL_SSE2__ +# define __KERNEL_SSE2__ +# endif +# endif +# ifdef __SSE3__ +# define __KERNEL_SSE3__ +# endif +# ifdef __SSSE3__ +# define __KERNEL_SSSE3__ +# endif +# ifdef __SSE4_1__ +# define __KERNEL_SSE41__ +# endif +# ifdef __AVX__ +# define __KERNEL_SSE__ +# define __KERNEL_AVX__ +# endif +# ifdef __AVX2__ +# define __KERNEL_SSE__ +# define __KERNEL_AVX2__ +# endif +#endif + +/* quiet unused define warnings */ +#if defined(__KERNEL_SSE2__) + /* do nothing */ +#endif + +#include "kernel/filter/filter.h" +#define KERNEL_ARCH cpu +#include "kernel/kernels/cpu/filter_cpu_impl.h" diff --git a/intern/cycles/kernel/kernels/cpu/filter_avx.cpp b/intern/cycles/kernel/kernels/cpu/filter_avx.cpp new file mode 100644 index 00000000000..4a9e6047ecf --- /dev/null +++ b/intern/cycles/kernel/kernels/cpu/filter_avx.cpp @@ -0,0 +1,39 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* Optimized CPU kernel entry points. This file is compiled with AVX + * optimization flags and nearly all functions inlined, while kernel.cpp + * is compiled without for other CPU's. 
*/ + +#include "util/util_optimization.h" + +#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_AVX +# define KERNEL_STUB +#else +/* SSE optimization disabled for now on 32 bit, see bug #36316 */ +# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) +# define __KERNEL_SSE__ +# define __KERNEL_SSE2__ +# define __KERNEL_SSE3__ +# define __KERNEL_SSSE3__ +# define __KERNEL_SSE41__ +# define __KERNEL_AVX__ +# endif +#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX */ + +#include "kernel/filter/filter.h" +#define KERNEL_ARCH cpu_avx +#include "kernel/kernels/cpu/filter_cpu_impl.h" diff --git a/intern/cycles/kernel/kernels/cpu/filter_avx2.cpp b/intern/cycles/kernel/kernels/cpu/filter_avx2.cpp new file mode 100644 index 00000000000..c22ec576254 --- /dev/null +++ b/intern/cycles/kernel/kernels/cpu/filter_avx2.cpp @@ -0,0 +1,40 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* Optimized CPU kernel entry points. This file is compiled with AVX2 + * optimization flags and nearly all functions inlined, while kernel.cpp + * is compiled without for other CPU's. 
*/ + +#include "util/util_optimization.h" + +#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 +# define KERNEL_STUB +#else +/* SSE optimization disabled for now on 32 bit, see bug #36316 */ +# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) +# define __KERNEL_SSE__ +# define __KERNEL_SSE2__ +# define __KERNEL_SSE3__ +# define __KERNEL_SSSE3__ +# define __KERNEL_SSE41__ +# define __KERNEL_AVX__ +# define __KERNEL_AVX2__ +# endif +#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 */ + +#include "kernel/filter/filter.h" +#define KERNEL_ARCH cpu_avx2 +#include "kernel/kernels/cpu/filter_cpu_impl.h" diff --git a/intern/cycles/kernel/kernels/cpu/filter_cpu.h b/intern/cycles/kernel/kernels/cpu/filter_cpu.h new file mode 100644 index 00000000000..2ed713299fd --- /dev/null +++ b/intern/cycles/kernel/kernels/cpu/filter_cpu.h @@ -0,0 +1,138 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* Templated common declaration part of all CPU kernels. 
*/ + +void KERNEL_FUNCTION_FULL_NAME(filter_divide_shadow)(int sample, + TilesInfo *tiles, + int x, + int y, + float *unfilteredA, + float *unfilteredB, + float *sampleV, + float *sampleVV, + float *bufferV, + int* prefilter_rect, + int buffer_pass_stride, + int buffer_denoising_offset, + bool use_split_variance); + +void KERNEL_FUNCTION_FULL_NAME(filter_get_feature)(int sample, + TilesInfo *tiles, + int m_offset, + int v_offset, + int x, + int y, + float *mean, + float *variance, + int* prefilter_rect, + int buffer_pass_stride, + int buffer_denoising_offset, + bool use_split_variance); + +void KERNEL_FUNCTION_FULL_NAME(filter_detect_outliers)(int x, int y, + ccl_global float *image, + ccl_global float *variance, + ccl_global float *depth, + ccl_global float *output, + int *rect, + int pass_stride); + +void KERNEL_FUNCTION_FULL_NAME(filter_combine_halves)(int x, int y, + float *mean, + float *variance, + float *a, + float *b, + int* prefilter_rect, + int r); + +void KERNEL_FUNCTION_FULL_NAME(filter_construct_transform)(float* buffer, + int x, + int y, + int storage_ofs, + float *transform, + int *rank, + int* rect, + int pass_stride, + int radius, + float pca_threshold); + +void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_difference)(int dx, + int dy, + float *weight_image, + float *variance, + float *difference_image, + int* rect, + int w, + int channel_offset, + float a, + float k_2); + +void KERNEL_FUNCTION_FULL_NAME(filter_nlm_blur)(float *difference_image, + float *out_image, + int* rect, + int w, + int f); + +void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_weight)(float *difference_image, + float *out_image, + int* rect, + int w, + int f); + +void KERNEL_FUNCTION_FULL_NAME(filter_nlm_update_output)(int dx, + int dy, + float *difference_image, + float *image, + float *out_image, + float *accum_image, + int* rect, + int w, + int f); + +void KERNEL_FUNCTION_FULL_NAME(filter_nlm_construct_gramian)(int dx, + int dy, + float *difference_image, + float *buffer, + float 
*transform, + int *rank, + float *XtWX, + float3 *XtWY, + int *rect, + int *filter_rect, + int w, + int h, + int f, + int pass_stride); + +void KERNEL_FUNCTION_FULL_NAME(filter_nlm_normalize)(float *out_image, + float *accum_image, + int* rect, + int w); + +void KERNEL_FUNCTION_FULL_NAME(filter_finalize)(int x, + int y, + int storage_ofs, + int w, + int h, + float *buffer, + int *rank, + float *XtWX, + float3 *XtWY, + int *buffer_params, + int sample); + +#undef KERNEL_ARCH diff --git a/intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h b/intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h new file mode 100644 index 00000000000..8dc1a8d583c --- /dev/null +++ b/intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h @@ -0,0 +1,272 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* Templated common implementation part of all CPU kernels. + * + * The idea is that each particular .cpp file sets the needed optimization flags and + * simply includes this file, without having to copy the actual implementation over. 
+ */ + +#include "kernel/kernel_compat_cpu.h" + +#include "kernel/filter/filter_kernel.h" + +#ifdef KERNEL_STUB +# include "util/util_debug.h" +# define STUB_ASSERT(arch, name) assert(!(#name " kernel stub for architecture " #arch " was called!")) +#endif + +CCL_NAMESPACE_BEGIN + + +/* Denoise filter */ + +void KERNEL_FUNCTION_FULL_NAME(filter_divide_shadow)(int sample, + TilesInfo *tiles, + int x, + int y, + float *unfilteredA, + float *unfilteredB, + float *sampleVariance, + float *sampleVarianceV, + float *bufferVariance, + int* prefilter_rect, + int buffer_pass_stride, + int buffer_denoising_offset, + bool use_split_variance) +{ +#ifdef KERNEL_STUB + STUB_ASSERT(KERNEL_ARCH, filter_divide_shadow); +#else + kernel_filter_divide_shadow(sample, tiles, + x, y, + unfilteredA, + unfilteredB, + sampleVariance, + sampleVarianceV, + bufferVariance, + load_int4(prefilter_rect), + buffer_pass_stride, + buffer_denoising_offset, + use_split_variance); +#endif +} + +void KERNEL_FUNCTION_FULL_NAME(filter_get_feature)(int sample, + TilesInfo *tiles, + int m_offset, + int v_offset, + int x, + int y, + float *mean, float *variance, + int* prefilter_rect, + int buffer_pass_stride, + int buffer_denoising_offset, + bool use_split_variance) +{ +#ifdef KERNEL_STUB + STUB_ASSERT(KERNEL_ARCH, filter_get_feature); +#else + kernel_filter_get_feature(sample, tiles, + m_offset, v_offset, + x, y, + mean, variance, + load_int4(prefilter_rect), + buffer_pass_stride, + buffer_denoising_offset, + use_split_variance); +#endif +} + +void KERNEL_FUNCTION_FULL_NAME(filter_detect_outliers)(int x, int y, + ccl_global float *image, + ccl_global float *variance, + ccl_global float *depth, + ccl_global float *output, + int *rect, + int pass_stride) +{ +#ifdef KERNEL_STUB + STUB_ASSERT(KERNEL_ARCH, filter_detect_outliers); +#else + kernel_filter_detect_outliers(x, y, image, variance, depth, output, load_int4(rect), pass_stride); +#endif +} + +void KERNEL_FUNCTION_FULL_NAME(filter_combine_halves)(int x, 
int y, + float *mean, + float *variance, + float *a, + float *b, + int* prefilter_rect, + int r) +{ +#ifdef KERNEL_STUB + STUB_ASSERT(KERNEL_ARCH, filter_combine_halves); +#else + kernel_filter_combine_halves(x, y, mean, variance, a, b, load_int4(prefilter_rect), r); +#endif +} + +void KERNEL_FUNCTION_FULL_NAME(filter_construct_transform)(float* buffer, + int x, + int y, + int storage_ofs, + float *transform, + int *rank, + int* prefilter_rect, + int pass_stride, + int radius, + float pca_threshold) +{ +#ifdef KERNEL_STUB + STUB_ASSERT(KERNEL_ARCH, filter_construct_transform); +#else + rank += storage_ofs; + transform += storage_ofs*TRANSFORM_SIZE; + kernel_filter_construct_transform(buffer, + x, y, + load_int4(prefilter_rect), + pass_stride, + transform, + rank, + radius, + pca_threshold); +#endif +} + +void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_difference)(int dx, + int dy, + float *weight_image, + float *variance, + float *difference_image, + int *rect, + int w, + int channel_offset, + float a, + float k_2) +{ +#ifdef KERNEL_STUB + STUB_ASSERT(KERNEL_ARCH, filter_nlm_calc_difference); +#else + kernel_filter_nlm_calc_difference(dx, dy, weight_image, variance, difference_image, load_int4(rect), w, channel_offset, a, k_2); +#endif +} + +void KERNEL_FUNCTION_FULL_NAME(filter_nlm_blur)(float *difference_image, + float *out_image, + int *rect, + int w, + int f) +{ +#ifdef KERNEL_STUB + STUB_ASSERT(KERNEL_ARCH, filter_nlm_blur); +#else + kernel_filter_nlm_blur(difference_image, out_image, load_int4(rect), w, f); +#endif +} + +void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_weight)(float *difference_image, + float *out_image, + int *rect, + int w, + int f) +{ +#ifdef KERNEL_STUB + STUB_ASSERT(KERNEL_ARCH, filter_nlm_calc_weight); +#else + kernel_filter_nlm_calc_weight(difference_image, out_image, load_int4(rect), w, f); +#endif +} + +void KERNEL_FUNCTION_FULL_NAME(filter_nlm_update_output)(int dx, + int dy, + float *difference_image, + float *image, + float 
*out_image, + float *accum_image, + int *rect, + int w, + int f) +{ +#ifdef KERNEL_STUB + STUB_ASSERT(KERNEL_ARCH, filter_nlm_update_output); +#else + kernel_filter_nlm_update_output(dx, dy, difference_image, image, out_image, accum_image, load_int4(rect), w, f); +#endif +} + +void KERNEL_FUNCTION_FULL_NAME(filter_nlm_construct_gramian)(int dx, + int dy, + float *difference_image, + float *buffer, + float *transform, + int *rank, + float *XtWX, + float3 *XtWY, + int *rect, + int *filter_rect, + int w, + int h, + int f, + int pass_stride) +{ +#ifdef KERNEL_STUB + STUB_ASSERT(KERNEL_ARCH, filter_nlm_construct_gramian); +#else + kernel_filter_nlm_construct_gramian(dx, dy, difference_image, buffer, transform, rank, XtWX, XtWY, load_int4(rect), load_int4(filter_rect), w, h, f, pass_stride); +#endif +} + +void KERNEL_FUNCTION_FULL_NAME(filter_nlm_normalize)(float *out_image, + float *accum_image, + int *rect, + int w) +{ +#ifdef KERNEL_STUB + STUB_ASSERT(KERNEL_ARCH, filter_nlm_normalize); +#else + kernel_filter_nlm_normalize(out_image, accum_image, load_int4(rect), w); +#endif +} + +void KERNEL_FUNCTION_FULL_NAME(filter_finalize)(int x, + int y, + int storage_ofs, + int w, + int h, + float *buffer, + int *rank, + float *XtWX, + float3 *XtWY, + int *buffer_params, + int sample) +{ +#ifdef KERNEL_STUB + STUB_ASSERT(KERNEL_ARCH, filter_finalize); +#else + XtWX += storage_ofs*XTWX_SIZE; + XtWY += storage_ofs*XTWY_SIZE; + rank += storage_ofs; + kernel_filter_finalize(x, y, w, h, buffer, rank, 1, XtWX, XtWY, load_int4(buffer_params), sample); +#endif +} + +#undef KERNEL_STUB +#undef STUB_ASSERT +#undef KERNEL_ARCH + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernels/cpu/filter_sse2.cpp b/intern/cycles/kernel/kernels/cpu/filter_sse2.cpp new file mode 100644 index 00000000000..f7c9935f1d0 --- /dev/null +++ b/intern/cycles/kernel/kernels/cpu/filter_sse2.cpp @@ -0,0 +1,34 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, 
Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* Optimized CPU kernel entry points. This file is compiled with SSE2 + * optimization flags and nearly all functions inlined, while kernel.cpp + * is compiled without for other CPU's. */ + +#include "util/util_optimization.h" + +#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 +# define KERNEL_STUB +#else +/* SSE optimization disabled for now on 32 bit, see bug #36316 */ +# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) +# define __KERNEL_SSE2__ +# endif +#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 */ + +#include "kernel/filter/filter.h" +#define KERNEL_ARCH cpu_sse2 +#include "kernel/kernels/cpu/filter_cpu_impl.h" diff --git a/intern/cycles/kernel/kernels/cpu/filter_sse3.cpp b/intern/cycles/kernel/kernels/cpu/filter_sse3.cpp new file mode 100644 index 00000000000..070b95a3505 --- /dev/null +++ b/intern/cycles/kernel/kernels/cpu/filter_sse3.cpp @@ -0,0 +1,36 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* Optimized CPU kernel entry points. This file is compiled with SSE3/SSSE3 + * optimization flags and nearly all functions inlined, while kernel.cpp + * is compiled without for other CPU's. */ + +#include "util/util_optimization.h" + +#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 +# define KERNEL_STUB +#else +/* SSE optimization disabled for now on 32 bit, see bug #36316 */ +# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) +# define __KERNEL_SSE2__ +# define __KERNEL_SSE3__ +# define __KERNEL_SSSE3__ +# endif +#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 */ + +#include "kernel/filter/filter.h" +#define KERNEL_ARCH cpu_sse3 +#include "kernel/kernels/cpu/filter_cpu_impl.h" diff --git a/intern/cycles/kernel/kernels/cpu/filter_sse41.cpp b/intern/cycles/kernel/kernels/cpu/filter_sse41.cpp new file mode 100644 index 00000000000..1a7b2040da1 --- /dev/null +++ b/intern/cycles/kernel/kernels/cpu/filter_sse41.cpp @@ -0,0 +1,37 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* Optimized CPU kernel entry points. This file is compiled with SSE4.1 + * optimization flags and nearly all functions inlined, while kernel.cpp + * is compiled without for other CPU's. 
*/ + +#include "util/util_optimization.h" + +#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 +# define KERNEL_STUB +#else +/* SSE optimization disabled for now on 32 bit, see bug #36316 */ +# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) +# define __KERNEL_SSE2__ +# define __KERNEL_SSE3__ +# define __KERNEL_SSSE3__ +# define __KERNEL_SSE41__ +# endif +#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 */ + +#include "kernel/filter/filter.h" +#define KERNEL_ARCH cpu_sse41 +#include "kernel/kernels/cpu/filter_cpu_impl.h" diff --git a/intern/cycles/kernel/kernels/cpu/kernel.cpp b/intern/cycles/kernel/kernels/cpu/kernel.cpp index 16992c681e6..998619ac897 100644 --- a/intern/cycles/kernel/kernels/cpu/kernel.cpp +++ b/intern/cycles/kernel/kernels/cpu/kernel.cpp @@ -95,9 +95,12 @@ void kernel_tex_copy(KernelGlobals *kg, else if(strstr(name, "__tex_image_float4")) { texture_image_float4 *tex = NULL; int id = atoi(name + strlen("__tex_image_float4_")); - int array_index = id; + int array_index = kernel_tex_index(id); - if(array_index >= 0 && array_index < TEX_NUM_FLOAT4_CPU) { + if(array_index >= 0) { + if(array_index >= kg->texture_float4_images.size()) { + kg->texture_float4_images.resize(array_index+1); + } tex = &kg->texture_float4_images[array_index]; } @@ -111,9 +114,12 @@ void kernel_tex_copy(KernelGlobals *kg, else if(strstr(name, "__tex_image_float")) { texture_image_float *tex = NULL; int id = atoi(name + strlen("__tex_image_float_")); - int array_index = id - TEX_START_FLOAT_CPU; + int array_index = kernel_tex_index(id); - if(array_index >= 0 && array_index < TEX_NUM_FLOAT_CPU) { + if(array_index >= 0) { + if(array_index >= kg->texture_float_images.size()) { + kg->texture_float_images.resize(array_index+1); + } tex = &kg->texture_float_images[array_index]; } @@ -127,9 +133,12 @@ void kernel_tex_copy(KernelGlobals *kg, else if(strstr(name, "__tex_image_byte4")) { texture_image_uchar4 *tex = NULL; int id = atoi(name + strlen("__tex_image_byte4_")); - int 
array_index = id - TEX_START_BYTE4_CPU; + int array_index = kernel_tex_index(id); - if(array_index >= 0 && array_index < TEX_NUM_BYTE4_CPU) { + if(array_index >= 0) { + if(array_index >= kg->texture_byte4_images.size()) { + kg->texture_byte4_images.resize(array_index+1); + } tex = &kg->texture_byte4_images[array_index]; } @@ -143,9 +152,12 @@ void kernel_tex_copy(KernelGlobals *kg, else if(strstr(name, "__tex_image_byte")) { texture_image_uchar *tex = NULL; int id = atoi(name + strlen("__tex_image_byte_")); - int array_index = id - TEX_START_BYTE_CPU; + int array_index = kernel_tex_index(id); - if(array_index >= 0 && array_index < TEX_NUM_BYTE_CPU) { + if(array_index >= 0) { + if(array_index >= kg->texture_byte_images.size()) { + kg->texture_byte_images.resize(array_index+1); + } tex = &kg->texture_byte_images[array_index]; } @@ -159,9 +171,12 @@ void kernel_tex_copy(KernelGlobals *kg, else if(strstr(name, "__tex_image_half4")) { texture_image_half4 *tex = NULL; int id = atoi(name + strlen("__tex_image_half4_")); - int array_index = id - TEX_START_HALF4_CPU; + int array_index = kernel_tex_index(id); - if(array_index >= 0 && array_index < TEX_NUM_HALF4_CPU) { + if(array_index >= 0) { + if(array_index >= kg->texture_half4_images.size()) { + kg->texture_half4_images.resize(array_index+1); + } tex = &kg->texture_half4_images[array_index]; } @@ -175,9 +190,12 @@ void kernel_tex_copy(KernelGlobals *kg, else if(strstr(name, "__tex_image_half")) { texture_image_half *tex = NULL; int id = atoi(name + strlen("__tex_image_half_")); - int array_index = id - TEX_START_HALF_CPU; + int array_index = kernel_tex_index(id); - if(array_index >= 0 && array_index < TEX_NUM_HALF_CPU) { + if(array_index >= 0) { + if(array_index >= kg->texture_half_images.size()) { + kg->texture_half_images.resize(array_index+1); + } tex = &kg->texture_half_images[array_index]; } diff --git a/intern/cycles/kernel/kernels/cpu/kernel_avx.cpp b/intern/cycles/kernel/kernels/cpu/kernel_avx.cpp index 
2600d977972..a645fb4d8dd 100644 --- a/intern/cycles/kernel/kernels/cpu/kernel_avx.cpp +++ b/intern/cycles/kernel/kernels/cpu/kernel_avx.cpp @@ -17,21 +17,23 @@ /* Optimized CPU kernel entry points. This file is compiled with AVX * optimization flags and nearly all functions inlined, while kernel.cpp * is compiled without for other CPU's. */ - -/* SSE optimization disabled for now on 32 bit, see bug #36316 */ -#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) -# define __KERNEL_SSE__ -# define __KERNEL_SSE2__ -# define __KERNEL_SSE3__ -# define __KERNEL_SSSE3__ -# define __KERNEL_SSE41__ -# define __KERNEL_AVX__ -#endif #include "util/util_optimization.h" -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX -# include "kernel/kernel.h" -# define KERNEL_ARCH cpu_avx -# include "kernel/kernels/cpu/kernel_cpu_impl.h" +#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_AVX +# define KERNEL_STUB +#else +/* SSE optimization disabled for now on 32 bit, see bug #36316 */ +# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) +# define __KERNEL_SSE__ +# define __KERNEL_SSE2__ +# define __KERNEL_SSE3__ +# define __KERNEL_SSSE3__ +# define __KERNEL_SSE41__ +# define __KERNEL_AVX__ +# endif #endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX */ + +#include "kernel/kernel.h" +#define KERNEL_ARCH cpu_avx +#include "kernel/kernels/cpu/kernel_cpu_impl.h" diff --git a/intern/cycles/kernel/kernels/cpu/kernel_avx2.cpp b/intern/cycles/kernel/kernels/cpu/kernel_avx2.cpp index dba15d037ac..6bbb87727b9 100644 --- a/intern/cycles/kernel/kernels/cpu/kernel_avx2.cpp +++ b/intern/cycles/kernel/kernels/cpu/kernel_avx2.cpp @@ -18,21 +18,23 @@ * optimization flags and nearly all functions inlined, while kernel.cpp * is compiled without for other CPU's. 
*/ -/* SSE optimization disabled for now on 32 bit, see bug #36316 */ -#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) -# define __KERNEL_SSE__ -# define __KERNEL_SSE2__ -# define __KERNEL_SSE3__ -# define __KERNEL_SSSE3__ -# define __KERNEL_SSE41__ -# define __KERNEL_AVX__ -# define __KERNEL_AVX2__ -#endif - #include "util/util_optimization.h" -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 -# include "kernel/kernel.h" -# define KERNEL_ARCH cpu_avx2 -# include "kernel/kernels/cpu/kernel_cpu_impl.h" +#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 +# define KERNEL_STUB +#else +/* SSE optimization disabled for now on 32 bit, see bug #36316 */ +# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) +# define __KERNEL_SSE__ +# define __KERNEL_SSE2__ +# define __KERNEL_SSE3__ +# define __KERNEL_SSSE3__ +# define __KERNEL_SSE41__ +# define __KERNEL_AVX__ +# define __KERNEL_AVX2__ +# endif #endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 */ + +#include "kernel/kernel.h" +#define KERNEL_ARCH cpu_avx2 +#include "kernel/kernels/cpu/kernel_cpu_impl.h" diff --git a/intern/cycles/kernel/kernels/cpu/kernel_cpu.h b/intern/cycles/kernel/kernels/cpu/kernel_cpu.h index 896b80d783e..c8938534fe8 100644 --- a/intern/cycles/kernel/kernels/cpu/kernel_cpu.h +++ b/intern/cycles/kernel/kernels/cpu/kernel_cpu.h @@ -77,16 +77,17 @@ DECLARE_SPLIT_KERNEL_FUNCTION(lamp_emission) DECLARE_SPLIT_KERNEL_FUNCTION(do_volume) DECLARE_SPLIT_KERNEL_FUNCTION(queue_enqueue) DECLARE_SPLIT_KERNEL_FUNCTION(indirect_background) +DECLARE_SPLIT_KERNEL_FUNCTION(shader_setup) +DECLARE_SPLIT_KERNEL_FUNCTION(shader_sort) DECLARE_SPLIT_KERNEL_FUNCTION(shader_eval) DECLARE_SPLIT_KERNEL_FUNCTION(holdout_emission_blurring_pathtermination_ao) DECLARE_SPLIT_KERNEL_FUNCTION(subsurface_scatter) DECLARE_SPLIT_KERNEL_FUNCTION(direct_lighting) DECLARE_SPLIT_KERNEL_FUNCTION(shadow_blocked_ao) DECLARE_SPLIT_KERNEL_FUNCTION(shadow_blocked_dl) +DECLARE_SPLIT_KERNEL_FUNCTION(enqueue_inactive) 
DECLARE_SPLIT_KERNEL_FUNCTION(next_iteration_setup) DECLARE_SPLIT_KERNEL_FUNCTION(indirect_subsurface) DECLARE_SPLIT_KERNEL_FUNCTION(buffer_update) -void KERNEL_FUNCTION_FULL_NAME(register_functions)(void(*reg)(const char* name, void* func)); - #undef KERNEL_ARCH diff --git a/intern/cycles/kernel/kernels/cpu/kernel_cpu_image.h b/intern/cycles/kernel/kernels/cpu/kernel_cpu_image.h index af68907a5c2..f6bb4c25012 100644 --- a/intern/cycles/kernel/kernels/cpu/kernel_cpu_image.h +++ b/intern/cycles/kernel/kernels/cpu/kernel_cpu_image.h @@ -23,51 +23,59 @@ CCL_NAMESPACE_BEGIN ccl_device float4 kernel_tex_image_interp_impl(KernelGlobals *kg, int tex, float x, float y) { - if(tex >= TEX_START_HALF_CPU) - return kg->texture_half_images[tex - TEX_START_HALF_CPU].interp(x, y); - else if(tex >= TEX_START_BYTE_CPU) - return kg->texture_byte_images[tex - TEX_START_BYTE_CPU].interp(x, y); - else if(tex >= TEX_START_FLOAT_CPU) - return kg->texture_float_images[tex - TEX_START_FLOAT_CPU].interp(x, y); - else if(tex >= TEX_START_HALF4_CPU) - return kg->texture_half4_images[tex - TEX_START_HALF4_CPU].interp(x, y); - else if(tex >= TEX_START_BYTE4_CPU) - return kg->texture_byte4_images[tex - TEX_START_BYTE4_CPU].interp(x, y); - else - return kg->texture_float4_images[tex].interp(x, y); + switch(kernel_tex_type(tex)) { + case IMAGE_DATA_TYPE_HALF: + return kg->texture_half_images[kernel_tex_index(tex)].interp(x, y); + case IMAGE_DATA_TYPE_BYTE: + return kg->texture_byte_images[kernel_tex_index(tex)].interp(x, y); + case IMAGE_DATA_TYPE_FLOAT: + return kg->texture_float_images[kernel_tex_index(tex)].interp(x, y); + case IMAGE_DATA_TYPE_HALF4: + return kg->texture_half4_images[kernel_tex_index(tex)].interp(x, y); + case IMAGE_DATA_TYPE_BYTE4: + return kg->texture_byte4_images[kernel_tex_index(tex)].interp(x, y); + case IMAGE_DATA_TYPE_FLOAT4: + default: + return kg->texture_float4_images[kernel_tex_index(tex)].interp(x, y); + } } ccl_device float4 
kernel_tex_image_interp_3d_impl(KernelGlobals *kg, int tex, float x, float y, float z) { - if(tex >= TEX_START_HALF_CPU) - return kg->texture_half_images[tex - TEX_START_HALF_CPU].interp_3d(x, y, z); - else if(tex >= TEX_START_BYTE_CPU) - return kg->texture_byte_images[tex - TEX_START_BYTE_CPU].interp_3d(x, y, z); - else if(tex >= TEX_START_FLOAT_CPU) - return kg->texture_float_images[tex - TEX_START_FLOAT_CPU].interp_3d(x, y, z); - else if(tex >= TEX_START_HALF4_CPU) - return kg->texture_half4_images[tex - TEX_START_HALF4_CPU].interp_3d(x, y, z); - else if(tex >= TEX_START_BYTE4_CPU) - return kg->texture_byte4_images[tex - TEX_START_BYTE4_CPU].interp_3d(x, y, z); - else - return kg->texture_float4_images[tex].interp_3d(x, y, z); - + switch(kernel_tex_type(tex)) { + case IMAGE_DATA_TYPE_HALF: + return kg->texture_half_images[kernel_tex_index(tex)].interp_3d(x, y, z); + case IMAGE_DATA_TYPE_BYTE: + return kg->texture_byte_images[kernel_tex_index(tex)].interp_3d(x, y, z); + case IMAGE_DATA_TYPE_FLOAT: + return kg->texture_float_images[kernel_tex_index(tex)].interp_3d(x, y, z); + case IMAGE_DATA_TYPE_HALF4: + return kg->texture_half4_images[kernel_tex_index(tex)].interp_3d(x, y, z); + case IMAGE_DATA_TYPE_BYTE4: + return kg->texture_byte4_images[kernel_tex_index(tex)].interp_3d(x, y, z); + case IMAGE_DATA_TYPE_FLOAT4: + default: + return kg->texture_float4_images[kernel_tex_index(tex)].interp_3d(x, y, z); + } } ccl_device float4 kernel_tex_image_interp_3d_ex_impl(KernelGlobals *kg, int tex, float x, float y, float z, int interpolation) { - if(tex >= TEX_START_HALF_CPU) - return kg->texture_half_images[tex - TEX_START_HALF_CPU].interp_3d_ex(x, y, z, interpolation); - else if(tex >= TEX_START_BYTE_CPU) - return kg->texture_byte_images[tex - TEX_START_BYTE_CPU].interp_3d_ex(x, y, z, interpolation); - else if(tex >= TEX_START_FLOAT_CPU) - return kg->texture_float_images[tex - TEX_START_FLOAT_CPU].interp_3d_ex(x, y, z, interpolation); - else if(tex >= TEX_START_HALF4_CPU) 
- return kg->texture_half4_images[tex - TEX_START_HALF4_CPU].interp_3d_ex(x, y, z, interpolation); - else if(tex >= TEX_START_BYTE4_CPU) - return kg->texture_byte4_images[tex - TEX_START_BYTE4_CPU].interp_3d_ex(x, y, z, interpolation); - else - return kg->texture_float4_images[tex].interp_3d_ex(x, y, z, interpolation); + switch(kernel_tex_type(tex)) { + case IMAGE_DATA_TYPE_HALF: + return kg->texture_half_images[kernel_tex_index(tex)].interp_3d_ex(x, y, z, interpolation); + case IMAGE_DATA_TYPE_BYTE: + return kg->texture_byte_images[kernel_tex_index(tex)].interp_3d_ex(x, y, z, interpolation); + case IMAGE_DATA_TYPE_FLOAT: + return kg->texture_float_images[kernel_tex_index(tex)].interp_3d_ex(x, y, z, interpolation); + case IMAGE_DATA_TYPE_HALF4: + return kg->texture_half4_images[kernel_tex_index(tex)].interp_3d_ex(x, y, z, interpolation); + case IMAGE_DATA_TYPE_BYTE4: + return kg->texture_byte4_images[kernel_tex_index(tex)].interp_3d_ex(x, y, z, interpolation); + case IMAGE_DATA_TYPE_FLOAT4: + default: + return kg->texture_float4_images[kernel_tex_index(tex)].interp_3d_ex(x, y, z, interpolation); + } } CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h b/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h index 148b2eef568..d4315ee5ec4 100644 --- a/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h +++ b/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h @@ -22,38 +22,50 @@ #include "kernel/kernel_compat_cpu.h" -#ifndef __SPLIT_KERNEL__ -# include "kernel/kernel_math.h" -# include "kernel/kernel_types.h" - -# include "kernel/split/kernel_split_data.h" -# include "kernel/kernel_globals.h" - -# include "kernel/kernels/cpu/kernel_cpu_image.h" -# include "kernel/kernel_film.h" -# include "kernel/kernel_path.h" -# include "kernel/kernel_path_branched.h" -# include "kernel/kernel_bake.h" +#ifndef KERNEL_STUB +# ifndef __SPLIT_KERNEL__ +# include "kernel/kernel_math.h" +# include "kernel/kernel_types.h" + +# include 
"kernel/split/kernel_split_data.h" +# include "kernel/kernel_globals.h" + +# include "kernel/kernels/cpu/kernel_cpu_image.h" +# include "kernel/kernel_film.h" +# include "kernel/kernel_path.h" +# include "kernel/kernel_path_branched.h" +# include "kernel/kernel_bake.h" +# else +# include "kernel/split/kernel_split_common.h" + +# include "kernel/split/kernel_data_init.h" +# include "kernel/split/kernel_path_init.h" +# include "kernel/split/kernel_scene_intersect.h" +# include "kernel/split/kernel_lamp_emission.h" +# include "kernel/split/kernel_do_volume.h" +# include "kernel/split/kernel_queue_enqueue.h" +# include "kernel/split/kernel_indirect_background.h" +# include "kernel/split/kernel_shader_setup.h" +# include "kernel/split/kernel_shader_sort.h" +# include "kernel/split/kernel_shader_eval.h" +# include "kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h" +# include "kernel/split/kernel_subsurface_scatter.h" +# include "kernel/split/kernel_direct_lighting.h" +# include "kernel/split/kernel_shadow_blocked_ao.h" +# include "kernel/split/kernel_shadow_blocked_dl.h" +# include "kernel/split/kernel_enqueue_inactive.h" +# include "kernel/split/kernel_next_iteration_setup.h" +# include "kernel/split/kernel_indirect_subsurface.h" +# include "kernel/split/kernel_buffer_update.h" +# endif /* __SPLIT_KERNEL__ */ #else -# include "kernel/split/kernel_split_common.h" - -# include "kernel/split/kernel_data_init.h" -# include "kernel/split/kernel_path_init.h" -# include "kernel/split/kernel_scene_intersect.h" -# include "kernel/split/kernel_lamp_emission.h" -# include "kernel/split/kernel_do_volume.h" -# include "kernel/split/kernel_queue_enqueue.h" -# include "kernel/split/kernel_indirect_background.h" -# include "kernel/split/kernel_shader_eval.h" -# include "kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h" -# include "kernel/split/kernel_subsurface_scatter.h" -# include "kernel/split/kernel_direct_lighting.h" -# include 
"kernel/split/kernel_shadow_blocked_ao.h" -# include "kernel/split/kernel_shadow_blocked_dl.h" -# include "kernel/split/kernel_next_iteration_setup.h" -# include "kernel/split/kernel_indirect_subsurface.h" -# include "kernel/split/kernel_buffer_update.h" -#endif +# include "util/util_debug.h" +# define STUB_ASSERT(arch, name) assert(!(#name " kernel stub for architecture " #arch " was called!")) + +# ifdef __SPLIT_KERNEL__ +# include "kernel/split/kernel_data_init.h" +# endif /* __SPLIT_KERNEL__ */ +#endif /* KERNEL_STUB */ CCL_NAMESPACE_BEGIN @@ -69,7 +81,10 @@ void KERNEL_FUNCTION_FULL_NAME(path_trace)(KernelGlobals *kg, int offset, int stride) { -#ifdef __BRANCHED_PATH__ +#ifdef KERNEL_STUB + STUB_ASSERT(KERNEL_ARCH, path_trace); +#else +# ifdef __BRANCHED_PATH__ if(kernel_data.integrator.branched) { kernel_branched_path_trace(kg, buffer, @@ -80,10 +95,11 @@ void KERNEL_FUNCTION_FULL_NAME(path_trace)(KernelGlobals *kg, stride); } else -#endif +# endif { kernel_path_trace(kg, buffer, rng_state, sample, x, y, offset, stride); } +#endif /* KERNEL_STUB */ } /* Film */ @@ -96,6 +112,9 @@ void KERNEL_FUNCTION_FULL_NAME(convert_to_byte)(KernelGlobals *kg, int offset, int stride) { +#ifdef KERNEL_STUB + STUB_ASSERT(KERNEL_ARCH, convert_to_byte); +#else kernel_film_convert_to_byte(kg, rgba, buffer, @@ -103,6 +122,7 @@ void KERNEL_FUNCTION_FULL_NAME(convert_to_byte)(KernelGlobals *kg, x, y, offset, stride); +#endif /* KERNEL_STUB */ } void KERNEL_FUNCTION_FULL_NAME(convert_to_half_float)(KernelGlobals *kg, @@ -113,6 +133,9 @@ void KERNEL_FUNCTION_FULL_NAME(convert_to_half_float)(KernelGlobals *kg, int offset, int stride) { +#ifdef KERNEL_STUB + STUB_ASSERT(KERNEL_ARCH, convert_to_half_float); +#else kernel_film_convert_to_half_float(kg, rgba, buffer, @@ -120,6 +143,7 @@ void KERNEL_FUNCTION_FULL_NAME(convert_to_half_float)(KernelGlobals *kg, x, y, offset, stride); +#endif /* KERNEL_STUB */ } /* Shader Evaluate */ @@ -134,9 +158,12 @@ void 
KERNEL_FUNCTION_FULL_NAME(shader)(KernelGlobals *kg, int offset, int sample) { +#ifdef KERNEL_STUB + STUB_ASSERT(KERNEL_ARCH, shader); +#else if(type >= SHADER_EVAL_BAKE) { kernel_assert(output_luma == NULL); -#ifdef __BAKING__ +# ifdef __BAKING__ kernel_bake_evaluate(kg, input, output, @@ -145,7 +172,7 @@ void KERNEL_FUNCTION_FULL_NAME(shader)(KernelGlobals *kg, i, offset, sample); -#endif +# endif } else { kernel_shader_evaluate(kg, @@ -156,24 +183,39 @@ void KERNEL_FUNCTION_FULL_NAME(shader)(KernelGlobals *kg, i, sample); } +#endif /* KERNEL_STUB */ } #else /* __SPLIT_KERNEL__ */ /* Split Kernel Path Tracing */ -#define DEFINE_SPLIT_KERNEL_FUNCTION(name) \ +#ifdef KERNEL_STUB +# define DEFINE_SPLIT_KERNEL_FUNCTION(name) \ + void KERNEL_FUNCTION_FULL_NAME(name)(KernelGlobals *kg, KernelData* /*data*/) \ + { \ + STUB_ASSERT(KERNEL_ARCH, name); \ + } + +# define DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(name, type) \ + void KERNEL_FUNCTION_FULL_NAME(name)(KernelGlobals *kg, KernelData* /*data*/) \ + { \ + STUB_ASSERT(KERNEL_ARCH, name); \ + } +#else +# define DEFINE_SPLIT_KERNEL_FUNCTION(name) \ void KERNEL_FUNCTION_FULL_NAME(name)(KernelGlobals *kg, KernelData* /*data*/) \ { \ kernel_##name(kg); \ } -#define DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(name, type) \ +# define DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(name, type) \ void KERNEL_FUNCTION_FULL_NAME(name)(KernelGlobals *kg, KernelData* /*data*/) \ { \ ccl_local type locals; \ kernel_##name(kg, &locals); \ } +#endif /* KERNEL_STUB */ DEFINE_SPLIT_KERNEL_FUNCTION(path_init) DEFINE_SPLIT_KERNEL_FUNCTION(scene_intersect) @@ -181,49 +223,22 @@ DEFINE_SPLIT_KERNEL_FUNCTION(lamp_emission) DEFINE_SPLIT_KERNEL_FUNCTION(do_volume) DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(queue_enqueue, QueueEnqueueLocals) DEFINE_SPLIT_KERNEL_FUNCTION(indirect_background) -DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(shader_eval, uint) +DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(shader_setup, uint) +DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(shader_sort, ShaderSortLocals) 
+DEFINE_SPLIT_KERNEL_FUNCTION(shader_eval) DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(holdout_emission_blurring_pathtermination_ao, BackgroundAOLocals) -DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(subsurface_scatter, uint) +DEFINE_SPLIT_KERNEL_FUNCTION(subsurface_scatter) DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(direct_lighting, uint) DEFINE_SPLIT_KERNEL_FUNCTION(shadow_blocked_ao) DEFINE_SPLIT_KERNEL_FUNCTION(shadow_blocked_dl) +DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(enqueue_inactive, uint) DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(next_iteration_setup, uint) DEFINE_SPLIT_KERNEL_FUNCTION(indirect_subsurface) DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(buffer_update, uint) - -void KERNEL_FUNCTION_FULL_NAME(register_functions)(void(*reg)(const char* name, void* func)) -{ -#define REGISTER_NAME_STRING(name) #name -#define REGISTER_EVAL_NAME(name) REGISTER_NAME_STRING(name) -#define REGISTER(name) reg(REGISTER_EVAL_NAME(KERNEL_FUNCTION_FULL_NAME(name)), (void*)KERNEL_FUNCTION_FULL_NAME(name)); - - REGISTER(path_trace); - REGISTER(convert_to_byte); - REGISTER(convert_to_half_float); - REGISTER(shader); - - REGISTER(data_init); - REGISTER(path_init); - REGISTER(scene_intersect); - REGISTER(lamp_emission); - REGISTER(do_volume); - REGISTER(queue_enqueue); - REGISTER(indirect_background); - REGISTER(shader_eval); - REGISTER(holdout_emission_blurring_pathtermination_ao); - REGISTER(subsurface_scatter); - REGISTER(direct_lighting); - REGISTER(shadow_blocked_ao); - REGISTER(shadow_blocked_dl); - REGISTER(next_iteration_setup); - REGISTER(indirect_subsurface); - REGISTER(buffer_update); - -#undef REGISTER -#undef REGISTER_EVAL_NAME -#undef REGISTER_NAME_STRING -} - #endif /* __SPLIT_KERNEL__ */ +#undef KERNEL_STUB +#undef STUB_ASSERT +#undef KERNEL_ARCH + CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernels/cpu/kernel_split_avx.cpp b/intern/cycles/kernel/kernels/cpu/kernel_split_avx.cpp index 27a746a0799..6ba3425a343 100644 --- a/intern/cycles/kernel/kernels/cpu/kernel_split_avx.cpp +++ 
b/intern/cycles/kernel/kernels/cpu/kernel_split_avx.cpp @@ -17,22 +17,25 @@ /* Optimized CPU kernel entry points. This file is compiled with AVX * optimization flags and nearly all functions inlined, while kernel.cpp * is compiled without for other CPU's. */ - -/* SSE optimization disabled for now on 32 bit, see bug #36316 */ -#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) -# define __KERNEL_SSE2__ -# define __KERNEL_SSE3__ -# define __KERNEL_SSSE3__ -# define __KERNEL_SSE41__ -# define __KERNEL_AVX__ -#endif #define __SPLIT_KERNEL__ #include "util/util_optimization.h" -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX -# include "kernel/kernel.h" -# define KERNEL_ARCH cpu_avx -# include "kernel/kernels/cpu/kernel_cpu_impl.h" +#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_AVX +# define KERNEL_STUB +#else +/* SSE optimization disabled for now on 32 bit, see bug #36316 */ +# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) +# define __KERNEL_SSE__ +# define __KERNEL_SSE2__ +# define __KERNEL_SSE3__ +# define __KERNEL_SSSE3__ +# define __KERNEL_SSE41__ +# define __KERNEL_AVX__ +# endif #endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX */ + +#include "kernel/kernel.h" +#define KERNEL_ARCH cpu_avx +#include "kernel/kernels/cpu/kernel_cpu_impl.h" diff --git a/intern/cycles/kernel/kernels/cpu/kernel_split_avx2.cpp b/intern/cycles/kernel/kernels/cpu/kernel_split_avx2.cpp index 364d279a189..76b2d77ebb8 100644 --- a/intern/cycles/kernel/kernels/cpu/kernel_split_avx2.cpp +++ b/intern/cycles/kernel/kernels/cpu/kernel_split_avx2.cpp @@ -18,23 +18,25 @@ * optimization flags and nearly all functions inlined, while kernel.cpp * is compiled without for other CPU's. 
*/ -/* SSE optimization disabled for now on 32 bit, see bug #36316 */ -#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) -# define __KERNEL_SSE__ -# define __KERNEL_SSE2__ -# define __KERNEL_SSE3__ -# define __KERNEL_SSSE3__ -# define __KERNEL_SSE41__ -# define __KERNEL_AVX__ -# define __KERNEL_AVX2__ -#endif - #define __SPLIT_KERNEL__ #include "util/util_optimization.h" -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 -# include "kernel/kernel.h" -# define KERNEL_ARCH cpu_avx2 -# include "kernel/kernels/cpu/kernel_cpu_impl.h" +#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 +# define KERNEL_STUB +#else +/* SSE optimization disabled for now on 32 bit, see bug #36316 */ +# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) +# define __KERNEL_SSE__ +# define __KERNEL_SSE2__ +# define __KERNEL_SSE3__ +# define __KERNEL_SSSE3__ +# define __KERNEL_SSE41__ +# define __KERNEL_AVX__ +# define __KERNEL_AVX2__ +# endif #endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 */ + +#include "kernel/kernel.h" +#define KERNEL_ARCH cpu_avx2 +#include "kernel/kernels/cpu/kernel_cpu_impl.h" diff --git a/intern/cycles/kernel/kernels/cpu/kernel_split_sse2.cpp b/intern/cycles/kernel/kernels/cpu/kernel_split_sse2.cpp index 0afb481296f..b468b6f44c8 100644 --- a/intern/cycles/kernel/kernels/cpu/kernel_split_sse2.cpp +++ b/intern/cycles/kernel/kernels/cpu/kernel_split_sse2.cpp @@ -18,17 +18,19 @@ * optimization flags and nearly all functions inlined, while kernel.cpp * is compiled without for other CPU's. 
*/ -/* SSE optimization disabled for now on 32 bit, see bug #36316 */ -#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) -# define __KERNEL_SSE2__ -#endif - #define __SPLIT_KERNEL__ #include "util/util_optimization.h" -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 -# include "kernel/kernel.h" -# define KERNEL_ARCH cpu_sse2 -# include "kernel/kernels/cpu/kernel_cpu_impl.h" +#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 +# define KERNEL_STUB +#else +/* SSE optimization disabled for now on 32 bit, see bug #36316 */ +# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) +# define __KERNEL_SSE2__ +# endif #endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 */ + +#include "kernel/kernel.h" +#define KERNEL_ARCH cpu_sse2 +#include "kernel/kernels/cpu/kernel_cpu_impl.h" diff --git a/intern/cycles/kernel/kernels/cpu/kernel_split_sse3.cpp b/intern/cycles/kernel/kernels/cpu/kernel_split_sse3.cpp index 13d00813591..3e5792d0b17 100644 --- a/intern/cycles/kernel/kernels/cpu/kernel_split_sse3.cpp +++ b/intern/cycles/kernel/kernels/cpu/kernel_split_sse3.cpp @@ -18,19 +18,21 @@ * optimization flags and nearly all functions inlined, while kernel.cpp * is compiled without for other CPU's. 
*/ -/* SSE optimization disabled for now on 32 bit, see bug #36316 */ -#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) -# define __KERNEL_SSE2__ -# define __KERNEL_SSE3__ -# define __KERNEL_SSSE3__ -#endif - #define __SPLIT_KERNEL__ #include "util/util_optimization.h" -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 -# include "kernel/kernel.h" -# define KERNEL_ARCH cpu_sse3 -# include "kernel/kernels/cpu/kernel_cpu_impl.h" +#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 +# define KERNEL_STUB +#else +/* SSE optimization disabled for now on 32 bit, see bug #36316 */ +# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) +# define __KERNEL_SSE2__ +# define __KERNEL_SSE3__ +# define __KERNEL_SSSE3__ +# endif #endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 */ + +#include "kernel/kernel.h" +#define KERNEL_ARCH cpu_sse3 +#include "kernel/kernels/cpu/kernel_cpu_impl.h" diff --git a/intern/cycles/kernel/kernels/cpu/kernel_split_sse41.cpp b/intern/cycles/kernel/kernels/cpu/kernel_split_sse41.cpp index a4312071edc..3629f21cd29 100644 --- a/intern/cycles/kernel/kernels/cpu/kernel_split_sse41.cpp +++ b/intern/cycles/kernel/kernels/cpu/kernel_split_sse41.cpp @@ -18,20 +18,22 @@ * optimization flags and nearly all functions inlined, while kernel.cpp * is compiled without for other CPU's. 
*/ -/* SSE optimization disabled for now on 32 bit, see bug #36316 */ -#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) -# define __KERNEL_SSE2__ -# define __KERNEL_SSE3__ -# define __KERNEL_SSSE3__ -# define __KERNEL_SSE41__ -#endif - #define __SPLIT_KERNEL__ #include "util/util_optimization.h" -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 -# include "kernel/kernel.h" -# define KERNEL_ARCH cpu_sse41 -# include "kernel/kernels/cpu/kernel_cpu_impl.h" +#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 +# define KERNEL_STUB +#else +/* SSE optimization disabled for now on 32 bit, see bug #36316 */ +# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) +# define __KERNEL_SSE2__ +# define __KERNEL_SSE3__ +# define __KERNEL_SSSE3__ +# define __KERNEL_SSE41__ +# endif #endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 */ + +#include "kernel/kernel.h" +#define KERNEL_ARCH cpu_sse41 +#include "kernel/kernels/cpu/kernel_cpu_impl.h" diff --git a/intern/cycles/kernel/kernels/cpu/kernel_sse2.cpp b/intern/cycles/kernel/kernels/cpu/kernel_sse2.cpp index 1acfaa91ac9..57530c88710 100644 --- a/intern/cycles/kernel/kernels/cpu/kernel_sse2.cpp +++ b/intern/cycles/kernel/kernels/cpu/kernel_sse2.cpp @@ -18,15 +18,17 @@ * optimization flags and nearly all functions inlined, while kernel.cpp * is compiled without for other CPU's. 
*/ -/* SSE optimization disabled for now on 32 bit, see bug #36316 */ -#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) -# define __KERNEL_SSE2__ -#endif - #include "util/util_optimization.h" -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 -# include "kernel/kernel.h" -# define KERNEL_ARCH cpu_sse2 -# include "kernel/kernels/cpu/kernel_cpu_impl.h" +#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 +# define KERNEL_STUB +#else +/* SSE optimization disabled for now on 32 bit, see bug #36316 */ +# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) +# define __KERNEL_SSE2__ +# endif #endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 */ + +#include "kernel/kernel.h" +#define KERNEL_ARCH cpu_sse2 +#include "kernel/kernels/cpu/kernel_cpu_impl.h" diff --git a/intern/cycles/kernel/kernels/cpu/kernel_sse3.cpp b/intern/cycles/kernel/kernels/cpu/kernel_sse3.cpp index f7b6a2e21fe..c607753bc4b 100644 --- a/intern/cycles/kernel/kernels/cpu/kernel_sse3.cpp +++ b/intern/cycles/kernel/kernels/cpu/kernel_sse3.cpp @@ -18,17 +18,19 @@ * optimization flags and nearly all functions inlined, while kernel.cpp * is compiled without for other CPU's. 
*/ -/* SSE optimization disabled for now on 32 bit, see bug #36316 */ -#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) -# define __KERNEL_SSE2__ -# define __KERNEL_SSE3__ -# define __KERNEL_SSSE3__ -#endif - #include "util/util_optimization.h" -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 -# include "kernel/kernel.h" -# define KERNEL_ARCH cpu_sse3 -# include "kernel/kernels/cpu/kernel_cpu_impl.h" +#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 +# define KERNEL_STUB +#else +/* SSE optimization disabled for now on 32 bit, see bug #36316 */ +# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) +# define __KERNEL_SSE2__ +# define __KERNEL_SSE3__ +# define __KERNEL_SSSE3__ +# endif #endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 */ + +#include "kernel/kernel.h" +#define KERNEL_ARCH cpu_sse3 +#include "kernel/kernels/cpu/kernel_cpu_impl.h" diff --git a/intern/cycles/kernel/kernels/cpu/kernel_sse41.cpp b/intern/cycles/kernel/kernels/cpu/kernel_sse41.cpp index 1900c6e3012..a278554731c 100644 --- a/intern/cycles/kernel/kernels/cpu/kernel_sse41.cpp +++ b/intern/cycles/kernel/kernels/cpu/kernel_sse41.cpp @@ -18,18 +18,20 @@ * optimization flags and nearly all functions inlined, while kernel.cpp * is compiled without for other CPU's. 
*/ -/* SSE optimization disabled for now on 32 bit, see bug #36316 */ -#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) -# define __KERNEL_SSE2__ -# define __KERNEL_SSE3__ -# define __KERNEL_SSSE3__ -# define __KERNEL_SSE41__ -#endif - #include "util/util_optimization.h" -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 -# include "kernel/kernel.h" -# define KERNEL_ARCH cpu_sse41 -# include "kernel/kernels/cpu//kernel_cpu_impl.h" +#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 +# define KERNEL_STUB +#else +/* SSE optimization disabled for now on 32 bit, see bug #36316 */ +# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) +# define __KERNEL_SSE2__ +# define __KERNEL_SSE3__ +# define __KERNEL_SSSE3__ +# define __KERNEL_SSE41__ +# endif #endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 */ + +#include "kernel/kernel.h" +#define KERNEL_ARCH cpu_sse41 +#include "kernel/kernels/cpu/kernel_cpu_impl.h" diff --git a/intern/cycles/kernel/kernels/cuda/filter.cu b/intern/cycles/kernel/kernels/cuda/filter.cu new file mode 100644 index 00000000000..009c3fde9d5 --- /dev/null +++ b/intern/cycles/kernel/kernels/cuda/filter.cu @@ -0,0 +1,255 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* CUDA kernel entry points */ + +#ifdef __CUDA_ARCH__ + +#include "kernel_config.h" + +#include "kernel/kernel_compat_cuda.h" + +#include "kernel/filter/filter_kernel.h" + +/* kernels */ + +extern "C" __global__ void +CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) +kernel_cuda_filter_divide_shadow(int sample, + TilesInfo *tiles, + float *unfilteredA, + float *unfilteredB, + float *sampleVariance, + float *sampleVarianceV, + float *bufferVariance, + int4 prefilter_rect, + int buffer_pass_stride, + int buffer_denoising_offset, + bool use_split_variance) +{ + int x = prefilter_rect.x + blockDim.x*blockIdx.x + threadIdx.x; + int y = prefilter_rect.y + blockDim.y*blockIdx.y + threadIdx.y; + if(x < prefilter_rect.z && y < prefilter_rect.w) { + kernel_filter_divide_shadow(sample, + tiles, + x, y, + unfilteredA, + unfilteredB, + sampleVariance, + sampleVarianceV, + bufferVariance, + prefilter_rect, + buffer_pass_stride, + buffer_denoising_offset, + use_split_variance); + } +} + +extern "C" __global__ void +CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) +kernel_cuda_filter_get_feature(int sample, + TilesInfo *tiles, + int m_offset, + int v_offset, + float *mean, + float *variance, + int4 prefilter_rect, + int buffer_pass_stride, + int buffer_denoising_offset, + bool use_split_variance) +{ + int x = prefilter_rect.x + blockDim.x*blockIdx.x + threadIdx.x; + int y = prefilter_rect.y + blockDim.y*blockIdx.y + threadIdx.y; + if(x < prefilter_rect.z && y < prefilter_rect.w) { + kernel_filter_get_feature(sample, + tiles, + m_offset, v_offset, + x, y, + mean, variance, + prefilter_rect, + buffer_pass_stride, + buffer_denoising_offset, + use_split_variance); + } +} + +extern "C" __global__ void +CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) +kernel_cuda_filter_detect_outliers(float *image, + float *variance, + float *depth, + float *output, + int4 prefilter_rect, + int pass_stride) +{ + int x = 
prefilter_rect.x + blockDim.x*blockIdx.x + threadIdx.x; + int y = prefilter_rect.y + blockDim.y*blockIdx.y + threadIdx.y; + if(x < prefilter_rect.z && y < prefilter_rect.w) { + kernel_filter_detect_outliers(x, y, image, variance, depth, output, prefilter_rect, pass_stride); + } +} + +extern "C" __global__ void +CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) +kernel_cuda_filter_combine_halves(float *mean, float *variance, float *a, float *b, int4 prefilter_rect, int r) +{ + int x = prefilter_rect.x + blockDim.x*blockIdx.x + threadIdx.x; + int y = prefilter_rect.y + blockDim.y*blockIdx.y + threadIdx.y; + if(x < prefilter_rect.z && y < prefilter_rect.w) { + kernel_filter_combine_halves(x, y, mean, variance, a, b, prefilter_rect, r); + } +} + +extern "C" __global__ void +CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) +kernel_cuda_filter_construct_transform(float const* __restrict__ buffer, + float *transform, int *rank, + int4 filter_area, int4 rect, + int radius, float pca_threshold, + int pass_stride) +{ + int x = blockDim.x*blockIdx.x + threadIdx.x; + int y = blockDim.y*blockIdx.y + threadIdx.y; + if(x < filter_area.z && y < filter_area.w) { + int *l_rank = rank + y*filter_area.z + x; + float *l_transform = transform + y*filter_area.z + x; + kernel_filter_construct_transform(buffer, + x + filter_area.x, y + filter_area.y, + rect, pass_stride, + l_transform, l_rank, + radius, pca_threshold, + filter_area.z*filter_area.w, + threadIdx.y*blockDim.x + threadIdx.x); + } +} + +extern "C" __global__ void +CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) +kernel_cuda_filter_nlm_calc_difference(int dx, int dy, + const float *ccl_restrict weight_image, + const float *ccl_restrict variance_image, + float *difference_image, + int4 rect, int w, + int channel_offset, + float a, float k_2) +{ + int x = blockDim.x*blockIdx.x + threadIdx.x + rect.x; + int y = blockDim.y*blockIdx.y + threadIdx.y + rect.y; + if(x 
< rect.z && y < rect.w) { + kernel_filter_nlm_calc_difference(x, y, dx, dy, weight_image, variance_image, difference_image, rect, w, channel_offset, a, k_2); + } +} + +extern "C" __global__ void +CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) +kernel_cuda_filter_nlm_blur(const float *ccl_restrict difference_image, float *out_image, int4 rect, int w, int f) +{ + int x = blockDim.x*blockIdx.x + threadIdx.x + rect.x; + int y = blockDim.y*blockIdx.y + threadIdx.y + rect.y; + if(x < rect.z && y < rect.w) { + kernel_filter_nlm_blur(x, y, difference_image, out_image, rect, w, f); + } +} + +extern "C" __global__ void +CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) +kernel_cuda_filter_nlm_calc_weight(const float *ccl_restrict difference_image, float *out_image, int4 rect, int w, int f) +{ + int x = blockDim.x*blockIdx.x + threadIdx.x + rect.x; + int y = blockDim.y*blockIdx.y + threadIdx.y + rect.y; + if(x < rect.z && y < rect.w) { + kernel_filter_nlm_calc_weight(x, y, difference_image, out_image, rect, w, f); + } +} + +extern "C" __global__ void +CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) +kernel_cuda_filter_nlm_update_output(int dx, int dy, + const float *ccl_restrict difference_image, + const float *ccl_restrict image, + float *out_image, float *accum_image, + int4 rect, int w, + int f) +{ + int x = blockDim.x*blockIdx.x + threadIdx.x + rect.x; + int y = blockDim.y*blockIdx.y + threadIdx.y + rect.y; + if(x < rect.z && y < rect.w) { + kernel_filter_nlm_update_output(x, y, dx, dy, difference_image, image, out_image, accum_image, rect, w, f); + } +} + +extern "C" __global__ void +CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) +kernel_cuda_filter_nlm_normalize(float *out_image, const float *ccl_restrict accum_image, int4 rect, int w) +{ + int x = blockDim.x*blockIdx.x + threadIdx.x + rect.x; + int y = blockDim.y*blockIdx.y + threadIdx.y + rect.y; + if(x < rect.z && y < 
rect.w) { + kernel_filter_nlm_normalize(x, y, out_image, accum_image, rect, w); + } +} + +extern "C" __global__ void +CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) +kernel_cuda_filter_nlm_construct_gramian(int dx, int dy, + const float *ccl_restrict difference_image, + const float *ccl_restrict buffer, + float const* __restrict__ transform, + int *rank, + float *XtWX, + float3 *XtWY, + int4 rect, + int4 filter_rect, + int w, int h, int f, + int pass_stride) +{ + int x = blockDim.x*blockIdx.x + threadIdx.x + max(0, rect.x-filter_rect.x); + int y = blockDim.y*blockIdx.y + threadIdx.y + max(0, rect.y-filter_rect.y); + if(x < min(filter_rect.z, rect.z-filter_rect.x) && y < min(filter_rect.w, rect.w-filter_rect.y)) { + kernel_filter_nlm_construct_gramian(x, y, + dx, dy, + difference_image, + buffer, + transform, rank, + XtWX, XtWY, + rect, filter_rect, + w, h, f, + pass_stride, + threadIdx.y*blockDim.x + threadIdx.x); + } +} + +extern "C" __global__ void +CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) +kernel_cuda_filter_finalize(int w, int h, + float *buffer, int *rank, + float *XtWX, float3 *XtWY, + int4 filter_area, int4 buffer_params, + int sample) +{ + int x = blockDim.x*blockIdx.x + threadIdx.x; + int y = blockDim.y*blockIdx.y + threadIdx.y; + if(x < filter_area.z && y < filter_area.w) { + int storage_ofs = y*filter_area.z+x; + rank += storage_ofs; + XtWX += storage_ofs; + XtWY += storage_ofs; + kernel_filter_finalize(x, y, w, h, buffer, rank, filter_area.z*filter_area.w, XtWX, XtWY, buffer_params, sample); + } +} + +#endif + diff --git a/intern/cycles/kernel/kernels/cuda/kernel_split.cu b/intern/cycles/kernel/kernels/cuda/kernel_split.cu index a679eff8409..628891b1458 100644 --- a/intern/cycles/kernel/kernels/cuda/kernel_split.cu +++ b/intern/cycles/kernel/kernels/cuda/kernel_split.cu @@ -31,12 +31,15 @@ #include "kernel/split/kernel_do_volume.h" #include "kernel/split/kernel_queue_enqueue.h" #include 
"kernel/split/kernel_indirect_background.h" +#include "kernel/split/kernel_shader_setup.h" +#include "kernel/split/kernel_shader_sort.h" #include "kernel/split/kernel_shader_eval.h" #include "kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h" #include "kernel/split/kernel_subsurface_scatter.h" #include "kernel/split/kernel_direct_lighting.h" #include "kernel/split/kernel_shadow_blocked_ao.h" #include "kernel/split/kernel_shadow_blocked_dl.h" +#include "kernel/split/kernel_enqueue_inactive.h" #include "kernel/split/kernel_next_iteration_setup.h" #include "kernel/split/kernel_indirect_subsurface.h" #include "kernel/split/kernel_buffer_update.h" @@ -108,12 +111,15 @@ DEFINE_SPLIT_KERNEL_FUNCTION(lamp_emission) DEFINE_SPLIT_KERNEL_FUNCTION(do_volume) DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(queue_enqueue, QueueEnqueueLocals) DEFINE_SPLIT_KERNEL_FUNCTION(indirect_background) -DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(shader_eval, uint) +DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(shader_setup, uint) +DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(shader_sort, ShaderSortLocals) +DEFINE_SPLIT_KERNEL_FUNCTION(shader_eval) DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(holdout_emission_blurring_pathtermination_ao, BackgroundAOLocals) -DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(subsurface_scatter, uint) +DEFINE_SPLIT_KERNEL_FUNCTION(subsurface_scatter) DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(direct_lighting, uint) DEFINE_SPLIT_KERNEL_FUNCTION(shadow_blocked_ao) DEFINE_SPLIT_KERNEL_FUNCTION(shadow_blocked_dl) +DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(enqueue_inactive, uint) DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(next_iteration_setup, uint) DEFINE_SPLIT_KERNEL_FUNCTION(indirect_subsurface) DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(buffer_update, uint) diff --git a/intern/cycles/kernel/kernels/opencl/filter.cl b/intern/cycles/kernel/kernels/opencl/filter.cl new file mode 100644 index 00000000000..ba53ba4b26f --- /dev/null +++ b/intern/cycles/kernel/kernels/opencl/filter.cl @@ -0,0 +1,280 @@ +/* + * Copyright 2011-2017 Blender 
Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* OpenCL kernel entry points */ + +#include "kernel/kernel_compat_opencl.h" + +#include "kernel/filter/filter_kernel.h" + +/* kernels */ + +__kernel void kernel_ocl_filter_divide_shadow(int sample, + ccl_global TilesInfo *tiles, + ccl_global float *unfilteredA, + ccl_global float *unfilteredB, + ccl_global float *sampleVariance, + ccl_global float *sampleVarianceV, + ccl_global float *bufferVariance, + int4 prefilter_rect, + int buffer_pass_stride, + int buffer_denoising_offset, + char use_split_variance) +{ + int x = prefilter_rect.x + get_global_id(0); + int y = prefilter_rect.y + get_global_id(1); + if(x < prefilter_rect.z && y < prefilter_rect.w) { + kernel_filter_divide_shadow(sample, + tiles, + x, y, + unfilteredA, + unfilteredB, + sampleVariance, + sampleVarianceV, + bufferVariance, + prefilter_rect, + buffer_pass_stride, + buffer_denoising_offset, + use_split_variance); + } +} + +__kernel void kernel_ocl_filter_get_feature(int sample, + ccl_global TilesInfo *tiles, + int m_offset, + int v_offset, + ccl_global float *mean, + ccl_global float *variance, + int4 prefilter_rect, + int buffer_pass_stride, + int buffer_denoising_offset, + char use_split_variance) +{ + int x = prefilter_rect.x + get_global_id(0); + int y = prefilter_rect.y + get_global_id(1); + if(x < prefilter_rect.z && y < prefilter_rect.w) { + kernel_filter_get_feature(sample, + tiles, + m_offset, v_offset, + x, y, + 
mean, variance, + prefilter_rect, + buffer_pass_stride, + buffer_denoising_offset, + use_split_variance); + } +} + +__kernel void kernel_ocl_filter_detect_outliers(ccl_global float *image, + ccl_global float *variance, + ccl_global float *depth, + ccl_global float *output, + int4 prefilter_rect, + int pass_stride) +{ + int x = prefilter_rect.x + get_global_id(0); + int y = prefilter_rect.y + get_global_id(1); + if(x < prefilter_rect.z && y < prefilter_rect.w) { + kernel_filter_detect_outliers(x, y, image, variance, depth, output, prefilter_rect, pass_stride); + } +} + +__kernel void kernel_ocl_filter_combine_halves(ccl_global float *mean, + ccl_global float *variance, + ccl_global float *a, + ccl_global float *b, + int4 prefilter_rect, + int r) +{ + int x = prefilter_rect.x + get_global_id(0); + int y = prefilter_rect.y + get_global_id(1); + if(x < prefilter_rect.z && y < prefilter_rect.w) { + kernel_filter_combine_halves(x, y, mean, variance, a, b, prefilter_rect, r); + } +} + +__kernel void kernel_ocl_filter_construct_transform(const ccl_global float *ccl_restrict buffer, + ccl_global float *transform, + ccl_global int *rank, + int4 filter_area, + int4 rect, + int pass_stride, + int radius, + float pca_threshold) +{ + int x = get_global_id(0); + int y = get_global_id(1); + if(x < filter_area.z && y < filter_area.w) { + ccl_global int *l_rank = rank + y*filter_area.z + x; + ccl_global float *l_transform = transform + y*filter_area.z + x; + kernel_filter_construct_transform(buffer, + x + filter_area.x, y + filter_area.y, + rect, pass_stride, + l_transform, l_rank, + radius, pca_threshold, + filter_area.z*filter_area.w, + get_local_id(1)*get_local_size(0) + get_local_id(0)); + } +} + +__kernel void kernel_ocl_filter_nlm_calc_difference(int dx, + int dy, + const ccl_global float *ccl_restrict weight_image, + const ccl_global float *ccl_restrict variance_image, + ccl_global float *difference_image, + int4 rect, + int w, + int channel_offset, + float a, + float k_2) +{ 
+ int x = get_global_id(0) + rect.x; + int y = get_global_id(1) + rect.y; + if(x < rect.z && y < rect.w) { + kernel_filter_nlm_calc_difference(x, y, dx, dy, weight_image, variance_image, difference_image, rect, w, channel_offset, a, k_2); + } +} + +__kernel void kernel_ocl_filter_nlm_blur(const ccl_global float *ccl_restrict difference_image, + ccl_global float *out_image, + int4 rect, + int w, + int f) +{ + int x = get_global_id(0) + rect.x; + int y = get_global_id(1) + rect.y; + if(x < rect.z && y < rect.w) { + kernel_filter_nlm_blur(x, y, difference_image, out_image, rect, w, f); + } +} + +__kernel void kernel_ocl_filter_nlm_calc_weight(const ccl_global float *ccl_restrict difference_image, + ccl_global float *out_image, + int4 rect, + int w, + int f) +{ + int x = get_global_id(0) + rect.x; + int y = get_global_id(1) + rect.y; + if(x < rect.z && y < rect.w) { + kernel_filter_nlm_calc_weight(x, y, difference_image, out_image, rect, w, f); + } +} + +__kernel void kernel_ocl_filter_nlm_update_output(int dx, + int dy, + const ccl_global float *ccl_restrict difference_image, + const ccl_global float *ccl_restrict image, + ccl_global float *out_image, + ccl_global float *accum_image, + int4 rect, + int w, + int f) +{ + int x = get_global_id(0) + rect.x; + int y = get_global_id(1) + rect.y; + if(x < rect.z && y < rect.w) { + kernel_filter_nlm_update_output(x, y, dx, dy, difference_image, image, out_image, accum_image, rect, w, f); + } +} + +__kernel void kernel_ocl_filter_nlm_normalize(ccl_global float *out_image, + const ccl_global float *ccl_restrict accum_image, + int4 rect, + int w) +{ + int x = get_global_id(0) + rect.x; + int y = get_global_id(1) + rect.y; + if(x < rect.z && y < rect.w) { + kernel_filter_nlm_normalize(x, y, out_image, accum_image, rect, w); + } +} + +__kernel void kernel_ocl_filter_nlm_construct_gramian(int dx, + int dy, + const ccl_global float *ccl_restrict difference_image, + const ccl_global float *ccl_restrict buffer, + const ccl_global 
float *ccl_restrict transform, + ccl_global int *rank, + ccl_global float *XtWX, + ccl_global float3 *XtWY, + int4 rect, + int4 filter_rect, + int w, + int h, + int f, + int pass_stride) +{ + int x = get_global_id(0) + max(0, rect.x-filter_rect.x); + int y = get_global_id(1) + max(0, rect.y-filter_rect.y); + if(x < min(filter_rect.z, rect.z-filter_rect.x) && y < min(filter_rect.w, rect.w-filter_rect.y)) { + kernel_filter_nlm_construct_gramian(x, y, + dx, dy, + difference_image, + buffer, + transform, rank, + XtWX, XtWY, + rect, filter_rect, + w, h, f, + pass_stride, + get_local_id(1)*get_local_size(0) + get_local_id(0)); + } +} + +__kernel void kernel_ocl_filter_finalize(int w, + int h, + ccl_global float *buffer, + ccl_global int *rank, + ccl_global float *XtWX, + ccl_global float3 *XtWY, + int4 filter_area, + int4 buffer_params, + int sample) +{ + int x = get_global_id(0); + int y = get_global_id(1); + if(x < filter_area.z && y < filter_area.w) { + int storage_ofs = y*filter_area.z+x; + rank += storage_ofs; + XtWX += storage_ofs; + XtWY += storage_ofs; + kernel_filter_finalize(x, y, w, h, buffer, rank, filter_area.z*filter_area.w, XtWX, XtWY, buffer_params, sample); + } +} + +__kernel void kernel_ocl_filter_set_tiles(ccl_global TilesInfo* tiles, + ccl_global float *buffer_1, + ccl_global float *buffer_2, + ccl_global float *buffer_3, + ccl_global float *buffer_4, + ccl_global float *buffer_5, + ccl_global float *buffer_6, + ccl_global float *buffer_7, + ccl_global float *buffer_8, + ccl_global float *buffer_9) +{ + if((get_global_id(0) == 0) && (get_global_id(1) == 0)) { + tiles->buffers[0] = buffer_1; + tiles->buffers[1] = buffer_2; + tiles->buffers[2] = buffer_3; + tiles->buffers[3] = buffer_4; + tiles->buffers[4] = buffer_5; + tiles->buffers[5] = buffer_6; + tiles->buffers[6] = buffer_7; + tiles->buffers[7] = buffer_8; + tiles->buffers[8] = buffer_9; + } +} diff --git a/intern/cycles/kernel/kernels/opencl/kernel_buffer_update.cl 
b/intern/cycles/kernel/kernels/opencl/kernel_buffer_update.cl index db65c91baf7..dcea2630aef 100644 --- a/intern/cycles/kernel/kernels/opencl/kernel_buffer_update.cl +++ b/intern/cycles/kernel/kernels/opencl/kernel_buffer_update.cl @@ -18,10 +18,9 @@ #include "kernel/split/kernel_split_common.h" #include "kernel/split/kernel_buffer_update.h" -__kernel void kernel_ocl_path_trace_buffer_update( - ccl_global char *kg, - ccl_constant KernelData *data) -{ - ccl_local unsigned int local_queue_atomics; - kernel_buffer_update((KernelGlobals*)kg, &local_queue_atomics); -} +#define KERNEL_NAME buffer_update +#define LOCALS_TYPE unsigned int +#include "kernel/kernels/opencl/kernel_split_function.h" +#undef KERNEL_NAME +#undef LOCALS_TYPE + diff --git a/intern/cycles/kernel/kernels/opencl/kernel_direct_lighting.cl b/intern/cycles/kernel/kernels/opencl/kernel_direct_lighting.cl index eb34f750881..ed64ae01aae 100644 --- a/intern/cycles/kernel/kernels/opencl/kernel_direct_lighting.cl +++ b/intern/cycles/kernel/kernels/opencl/kernel_direct_lighting.cl @@ -18,10 +18,9 @@ #include "kernel/split/kernel_split_common.h" #include "kernel/split/kernel_direct_lighting.h" -__kernel void kernel_ocl_path_trace_direct_lighting( - ccl_global char *kg, - ccl_constant KernelData *data) -{ - ccl_local unsigned int local_queue_atomics; - kernel_direct_lighting((KernelGlobals*)kg, &local_queue_atomics); -} +#define KERNEL_NAME direct_lighting +#define LOCALS_TYPE unsigned int +#include "kernel/kernels/opencl/kernel_split_function.h" +#undef KERNEL_NAME +#undef LOCALS_TYPE + diff --git a/intern/cycles/kernel/kernels/opencl/kernel_do_volume.cl b/intern/cycles/kernel/kernels/opencl/kernel_do_volume.cl index 83ef5f5f3f2..8afaa686e28 100644 --- a/intern/cycles/kernel/kernels/opencl/kernel_do_volume.cl +++ b/intern/cycles/kernel/kernels/opencl/kernel_do_volume.cl @@ -18,9 +18,7 @@ #include "kernel/split/kernel_split_common.h" #include "kernel/split/kernel_do_volume.h" -__kernel void 
kernel_ocl_path_trace_do_volume( - ccl_global char *kg, - ccl_constant KernelData *data) -{ - kernel_do_volume((KernelGlobals*)kg); -} +#define KERNEL_NAME do_volume +#include "kernel/kernels/opencl/kernel_split_function.h" +#undef KERNEL_NAME + diff --git a/intern/cycles/kernel/kernels/opencl/kernel_enqueue_inactive.cl b/intern/cycles/kernel/kernels/opencl/kernel_enqueue_inactive.cl new file mode 100644 index 00000000000..e68d4104a91 --- /dev/null +++ b/intern/cycles/kernel/kernels/opencl/kernel_enqueue_inactive.cl @@ -0,0 +1,26 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "kernel/kernel_compat_opencl.h" +#include "kernel/split/kernel_split_common.h" +#include "kernel/split/kernel_enqueue_inactive.h" + +#define KERNEL_NAME enqueue_inactive +#define LOCALS_TYPE unsigned int +#include "kernel/kernels/opencl/kernel_split_function.h" +#undef KERNEL_NAME +#undef LOCALS_TYPE + diff --git a/intern/cycles/kernel/kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl b/intern/cycles/kernel/kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl index d071b39aa6f..9e1e57beba6 100644 --- a/intern/cycles/kernel/kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl +++ b/intern/cycles/kernel/kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl @@ -18,12 +18,9 @@ #include "kernel/split/kernel_split_common.h" #include "kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h" -__kernel void kernel_ocl_path_trace_holdout_emission_blurring_pathtermination_ao( - ccl_global char *kg, - ccl_constant KernelData *data) -{ - ccl_local BackgroundAOLocals locals; - kernel_holdout_emission_blurring_pathtermination_ao( - (KernelGlobals*)kg, - &locals); -} +#define KERNEL_NAME holdout_emission_blurring_pathtermination_ao +#define LOCALS_TYPE BackgroundAOLocals +#include "kernel/kernels/opencl/kernel_split_function.h" +#undef KERNEL_NAME +#undef LOCALS_TYPE + diff --git a/intern/cycles/kernel/kernels/opencl/kernel_indirect_background.cl b/intern/cycles/kernel/kernels/opencl/kernel_indirect_background.cl index 8c213ff5cb2..192d01444ba 100644 --- a/intern/cycles/kernel/kernels/opencl/kernel_indirect_background.cl +++ b/intern/cycles/kernel/kernels/opencl/kernel_indirect_background.cl @@ -18,9 +18,7 @@ #include "kernel/split/kernel_split_common.h" #include "kernel/split/kernel_indirect_background.h" -__kernel void kernel_ocl_path_trace_indirect_background( - ccl_global char *kg, - ccl_constant KernelData *data) -{ - kernel_indirect_background((KernelGlobals*)kg); -} +#define 
KERNEL_NAME indirect_background +#include "kernel/kernels/opencl/kernel_split_function.h" +#undef KERNEL_NAME + diff --git a/intern/cycles/kernel/kernels/opencl/kernel_indirect_subsurface.cl b/intern/cycles/kernel/kernels/opencl/kernel_indirect_subsurface.cl index 998ebc4c0c3..84938b889e5 100644 --- a/intern/cycles/kernel/kernels/opencl/kernel_indirect_subsurface.cl +++ b/intern/cycles/kernel/kernels/opencl/kernel_indirect_subsurface.cl @@ -18,9 +18,7 @@ #include "kernel/split/kernel_split_common.h" #include "kernel/split/kernel_indirect_subsurface.h" -__kernel void kernel_ocl_path_trace_indirect_subsurface( - ccl_global char *kg, - ccl_constant KernelData *data) -{ - kernel_indirect_subsurface((KernelGlobals*)kg); -} +#define KERNEL_NAME indirect_subsurface +#include "kernel/kernels/opencl/kernel_split_function.h" +#undef KERNEL_NAME + diff --git a/intern/cycles/kernel/kernels/opencl/kernel_lamp_emission.cl b/intern/cycles/kernel/kernels/opencl/kernel_lamp_emission.cl index 822d2287715..c314dc96c33 100644 --- a/intern/cycles/kernel/kernels/opencl/kernel_lamp_emission.cl +++ b/intern/cycles/kernel/kernels/opencl/kernel_lamp_emission.cl @@ -18,9 +18,7 @@ #include "kernel/split/kernel_split_common.h" #include "kernel/split/kernel_lamp_emission.h" -__kernel void kernel_ocl_path_trace_lamp_emission( - ccl_global char *kg, - ccl_constant KernelData *data) -{ - kernel_lamp_emission((KernelGlobals*)kg); -} +#define KERNEL_NAME lamp_emission +#include "kernel/kernels/opencl/kernel_split_function.h" +#undef KERNEL_NAME + diff --git a/intern/cycles/kernel/kernels/opencl/kernel_next_iteration_setup.cl b/intern/cycles/kernel/kernels/opencl/kernel_next_iteration_setup.cl index 6d207253a40..8b1332bf013 100644 --- a/intern/cycles/kernel/kernels/opencl/kernel_next_iteration_setup.cl +++ b/intern/cycles/kernel/kernels/opencl/kernel_next_iteration_setup.cl @@ -18,10 +18,9 @@ #include "kernel/split/kernel_split_common.h" #include "kernel/split/kernel_next_iteration_setup.h" -__kernel 
void kernel_ocl_path_trace_next_iteration_setup( - ccl_global char *kg, - ccl_constant KernelData *data) -{ - ccl_local unsigned int local_queue_atomics; - kernel_next_iteration_setup((KernelGlobals*)kg, &local_queue_atomics); -} +#define KERNEL_NAME next_iteration_setup +#define LOCALS_TYPE unsigned int +#include "kernel/kernels/opencl/kernel_split_function.h" +#undef KERNEL_NAME +#undef LOCALS_TYPE + diff --git a/intern/cycles/kernel/kernels/opencl/kernel_path_init.cl b/intern/cycles/kernel/kernels/opencl/kernel_path_init.cl index bd9aa9538c8..fa210e747c0 100644 --- a/intern/cycles/kernel/kernels/opencl/kernel_path_init.cl +++ b/intern/cycles/kernel/kernels/opencl/kernel_path_init.cl @@ -18,9 +18,7 @@ #include "kernel/split/kernel_split_common.h" #include "kernel/split/kernel_path_init.h" -__kernel void kernel_ocl_path_trace_path_init( - ccl_global char *kg, - ccl_constant KernelData *data) -{ - kernel_path_init((KernelGlobals*)kg); -} +#define KERNEL_NAME path_init +#include "kernel/kernels/opencl/kernel_split_function.h" +#undef KERNEL_NAME + diff --git a/intern/cycles/kernel/kernels/opencl/kernel_queue_enqueue.cl b/intern/cycles/kernel/kernels/opencl/kernel_queue_enqueue.cl index 9be154e3d75..68ee6f1d536 100644 --- a/intern/cycles/kernel/kernels/opencl/kernel_queue_enqueue.cl +++ b/intern/cycles/kernel/kernels/opencl/kernel_queue_enqueue.cl @@ -18,10 +18,9 @@ #include "kernel/split/kernel_split_common.h" #include "kernel/split/kernel_queue_enqueue.h" -__kernel void kernel_ocl_path_trace_queue_enqueue( - ccl_global char *kg, - ccl_constant KernelData *data) -{ - ccl_local QueueEnqueueLocals locals; - kernel_queue_enqueue((KernelGlobals*)kg, &locals); -} +#define KERNEL_NAME queue_enqueue +#define LOCALS_TYPE QueueEnqueueLocals +#include "kernel/kernels/opencl/kernel_split_function.h" +#undef KERNEL_NAME +#undef LOCALS_TYPE + diff --git a/intern/cycles/kernel/kernels/opencl/kernel_scene_intersect.cl b/intern/cycles/kernel/kernels/opencl/kernel_scene_intersect.cl 
index eb4fb4d153a..10d09377ba9 100644 --- a/intern/cycles/kernel/kernels/opencl/kernel_scene_intersect.cl +++ b/intern/cycles/kernel/kernels/opencl/kernel_scene_intersect.cl @@ -18,9 +18,7 @@ #include "kernel/split/kernel_split_common.h" #include "kernel/split/kernel_scene_intersect.h" -__kernel void kernel_ocl_path_trace_scene_intersect( - ccl_global char *kg, - ccl_constant KernelData *data) -{ - kernel_scene_intersect((KernelGlobals*)kg); -} +#define KERNEL_NAME scene_intersect +#include "kernel/kernels/opencl/kernel_split_function.h" +#undef KERNEL_NAME + diff --git a/intern/cycles/kernel/kernels/opencl/kernel_shader_eval.cl b/intern/cycles/kernel/kernels/opencl/kernel_shader_eval.cl index 6baee460986..40eaa561863 100644 --- a/intern/cycles/kernel/kernels/opencl/kernel_shader_eval.cl +++ b/intern/cycles/kernel/kernels/opencl/kernel_shader_eval.cl @@ -18,10 +18,7 @@ #include "kernel/split/kernel_split_common.h" #include "kernel/split/kernel_shader_eval.h" -__kernel void kernel_ocl_path_trace_shader_eval( - ccl_global char *kg, - ccl_constant KernelData *data) -{ - ccl_local unsigned int local_queue_atomics; - kernel_shader_eval((KernelGlobals*)kg, &local_queue_atomics); -} +#define KERNEL_NAME shader_eval +#include "kernel/kernels/opencl/kernel_split_function.h" +#undef KERNEL_NAME + diff --git a/intern/cycles/kernel/kernels/opencl/kernel_shader_setup.cl b/intern/cycles/kernel/kernels/opencl/kernel_shader_setup.cl new file mode 100644 index 00000000000..8c36100f762 --- /dev/null +++ b/intern/cycles/kernel/kernels/opencl/kernel_shader_setup.cl @@ -0,0 +1,26 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "kernel/kernel_compat_opencl.h" +#include "kernel/split/kernel_split_common.h" +#include "kernel/split/kernel_shader_setup.h" + +#define KERNEL_NAME shader_setup +#define LOCALS_TYPE unsigned int +#include "kernel/kernels/opencl/kernel_split_function.h" +#undef KERNEL_NAME +#undef LOCALS_TYPE + diff --git a/intern/cycles/kernel/kernels/opencl/kernel_shader_sort.cl b/intern/cycles/kernel/kernels/opencl/kernel_shader_sort.cl new file mode 100644 index 00000000000..bcacaa4a054 --- /dev/null +++ b/intern/cycles/kernel/kernels/opencl/kernel_shader_sort.cl @@ -0,0 +1,27 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "kernel/kernel_compat_opencl.h" +#include "kernel/split/kernel_split_common.h" +#include "kernel/split/kernel_shader_sort.h" + +__attribute__((reqd_work_group_size(64, 1, 1))) +#define KERNEL_NAME shader_sort +#define LOCALS_TYPE ShaderSortLocals +#include "kernel/kernels/opencl/kernel_split_function.h" +#undef KERNEL_NAME +#undef LOCALS_TYPE + diff --git a/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked_ao.cl b/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked_ao.cl index 6a8ef81b32a..8de250a375c 100644 --- a/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked_ao.cl +++ b/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked_ao.cl @@ -18,9 +18,7 @@ #include "kernel/split/kernel_split_common.h" #include "kernel/split/kernel_shadow_blocked_ao.h" -__kernel void kernel_ocl_path_trace_shadow_blocked_ao( - ccl_global char *kg, - ccl_constant KernelData *data) -{ - kernel_shadow_blocked_ao((KernelGlobals*)kg); -} +#define KERNEL_NAME shadow_blocked_ao +#include "kernel/kernels/opencl/kernel_split_function.h" +#undef KERNEL_NAME + diff --git a/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked_dl.cl b/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked_dl.cl index b255cc5ef8b..29da77022ed 100644 --- a/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked_dl.cl +++ b/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked_dl.cl @@ -18,9 +18,7 @@ #include "kernel/split/kernel_split_common.h" #include "kernel/split/kernel_shadow_blocked_dl.h" -__kernel void kernel_ocl_path_trace_shadow_blocked_dl( - ccl_global char *kg, - ccl_constant KernelData *data) -{ - kernel_shadow_blocked_dl((KernelGlobals*)kg); -} +#define KERNEL_NAME shadow_blocked_dl +#include "kernel/kernels/opencl/kernel_split_function.h" +#undef KERNEL_NAME + diff --git a/intern/cycles/kernel/kernels/opencl/kernel_split.cl b/intern/cycles/kernel/kernels/opencl/kernel_split.cl index 732cda30115..651addb02f4 100644 --- 
a/intern/cycles/kernel/kernels/opencl/kernel_split.cl +++ b/intern/cycles/kernel/kernels/opencl/kernel_split.cl @@ -23,12 +23,15 @@ #include "kernel/kernels/opencl/kernel_do_volume.cl" #include "kernel/kernels/opencl/kernel_indirect_background.cl" #include "kernel/kernels/opencl/kernel_queue_enqueue.cl" +#include "kernel/kernels/opencl/kernel_shader_setup.cl" +#include "kernel/kernels/opencl/kernel_shader_sort.cl" #include "kernel/kernels/opencl/kernel_shader_eval.cl" #include "kernel/kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl" #include "kernel/kernels/opencl/kernel_subsurface_scatter.cl" #include "kernel/kernels/opencl/kernel_direct_lighting.cl" #include "kernel/kernels/opencl/kernel_shadow_blocked_ao.cl" #include "kernel/kernels/opencl/kernel_shadow_blocked_dl.cl" +#include "kernel/kernels/opencl/kernel_enqueue_inactive.cl" #include "kernel/kernels/opencl/kernel_next_iteration_setup.cl" #include "kernel/kernels/opencl/kernel_indirect_subsurface.cl" #include "kernel/kernels/opencl/kernel_buffer_update.cl" diff --git a/intern/cycles/kernel/kernels/opencl/kernel_split_function.h b/intern/cycles/kernel/kernels/opencl/kernel_split_function.h new file mode 100644 index 00000000000..f1e914a70d4 --- /dev/null +++ b/intern/cycles/kernel/kernels/opencl/kernel_split_function.h @@ -0,0 +1,72 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#define KERNEL_NAME_JOIN(a, b) a ## _ ## b +#define KERNEL_NAME_EVAL(a, b) KERNEL_NAME_JOIN(a, b) + +__kernel void KERNEL_NAME_EVAL(kernel_ocl_path_trace, KERNEL_NAME)( + ccl_global char *kg_global, + ccl_constant KernelData *data, + + ccl_global void *split_data_buffer, + ccl_global char *ray_state, + ccl_global uint *rng_state, + +#define KERNEL_TEX(type, ttype, name) \ + ccl_global type *name, +#include "kernel/kernel_textures.h" + + ccl_global int *queue_index, + ccl_global char *use_queues_flag, + ccl_global unsigned int *work_pools, + ccl_global float *buffer + ) +{ +#ifdef LOCALS_TYPE + ccl_local LOCALS_TYPE locals; +#endif + + KernelGlobals *kg = (KernelGlobals*)kg_global; + + if(ccl_local_id(0) + ccl_local_id(1) == 0) { + kg->data = data; + + kernel_split_params.rng_state = rng_state; + kernel_split_params.queue_index = queue_index; + kernel_split_params.use_queues_flag = use_queues_flag; + kernel_split_params.work_pools = work_pools; + kernel_split_params.buffer = buffer; + + split_data_init(kg, &kernel_split_state, ccl_global_size(0)*ccl_global_size(1), split_data_buffer, ray_state); + +#define KERNEL_TEX(type, ttype, name) \ + kg->name = name; +#include "kernel/kernel_textures.h" + } + + ccl_barrier(CCL_LOCAL_MEM_FENCE); + + KERNEL_NAME_EVAL(kernel, KERNEL_NAME)( + kg +#ifdef LOCALS_TYPE + , &locals +#endif + ); +} + +#undef KERNEL_NAME_JOIN +#undef KERNEL_NAME_EVAL + diff --git a/intern/cycles/kernel/kernels/opencl/kernel_subsurface_scatter.cl b/intern/cycles/kernel/kernels/opencl/kernel_subsurface_scatter.cl index 7a1838e485f..2b3be38df84 100644 --- a/intern/cycles/kernel/kernels/opencl/kernel_subsurface_scatter.cl +++ b/intern/cycles/kernel/kernels/opencl/kernel_subsurface_scatter.cl @@ -18,10 +18,7 @@ #include "kernel/split/kernel_split_common.h" #include "kernel/split/kernel_subsurface_scatter.h" -__kernel void kernel_ocl_path_trace_subsurface_scatter( - ccl_global char *kg, - ccl_constant KernelData *data) -{ - ccl_local unsigned int 
local_queue_atomics; - kernel_subsurface_scatter((KernelGlobals*)kg, &local_queue_atomics); -} +#define KERNEL_NAME subsurface_scatter +#include "kernel/kernels/opencl/kernel_split_function.h" +#undef KERNEL_NAME + diff --git a/intern/cycles/kernel/osl/osl_bssrdf.cpp b/intern/cycles/kernel/osl/osl_bssrdf.cpp index 95beea01d25..27a96720c1e 100644 --- a/intern/cycles/kernel/osl/osl_bssrdf.cpp +++ b/intern/cycles/kernel/osl/osl_bssrdf.cpp @@ -39,7 +39,9 @@ #include "kernel/kernel_montecarlo.h" #include "kernel/closure/alloc.h" +#include "kernel/closure/bsdf_util.h" #include "kernel/closure/bsdf_diffuse.h" +#include "kernel/closure/bsdf_principled_diffuse.h" #include "kernel/closure/bssrdf.h" CCL_NAMESPACE_BEGIN @@ -78,6 +80,7 @@ public: bssrdf->albedo = albedo.x; bssrdf->sharpness = sharpness; bssrdf->N = params.N; + bssrdf->roughness = params.roughness; sd->flag |= bssrdf_setup(bssrdf, (ClosureType)type); } @@ -89,6 +92,7 @@ public: bssrdf->albedo = albedo.y; bssrdf->sharpness = sharpness; bssrdf->N = params.N; + bssrdf->roughness = params.roughness; sd->flag |= bssrdf_setup(bssrdf, (ClosureType)type); } @@ -100,6 +104,7 @@ public: bssrdf->albedo = albedo.z; bssrdf->sharpness = sharpness; bssrdf->N = params.N; + bssrdf->roughness = params.roughness; sd->flag |= bssrdf_setup(bssrdf, (ClosureType)type); } } @@ -180,5 +185,31 @@ ClosureParam *closure_bssrdf_burley_params() CCLOSURE_PREPARE(closure_bssrdf_burley_prepare, BurleyBSSRDFClosure) +/* Disney principled */ + +class PrincipledBSSRDFClosure : public CBSSRDFClosure { +public: + void setup(ShaderData *sd, int path_flag, float3 weight) + { + alloc(sd, path_flag, weight, CLOSURE_BSSRDF_PRINCIPLED_ID); + } +}; + +ClosureParam *closure_bssrdf_principled_params() +{ + static ClosureParam params[] = { + CLOSURE_FLOAT3_PARAM(PrincipledBSSRDFClosure, params.N), + CLOSURE_FLOAT3_PARAM(PrincipledBSSRDFClosure, radius), + CLOSURE_FLOAT_PARAM(PrincipledBSSRDFClosure, params.texture_blur), + 
CLOSURE_FLOAT3_PARAM(PrincipledBSSRDFClosure, albedo), + CLOSURE_FLOAT_PARAM(PrincipledBSSRDFClosure, params.roughness), + CLOSURE_STRING_KEYPARAM(PrincipledBSSRDFClosure, label, "label"), + CLOSURE_FINISH_PARAM(PrincipledBSSRDFClosure) + }; + return params; +} + +CCLOSURE_PREPARE(closure_bssrdf_principled_prepare, PrincipledBSSRDFClosure) + CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/osl/osl_closures.cpp b/intern/cycles/kernel/osl/osl_closures.cpp index f44714c2150..14c5c1c3db5 100644 --- a/intern/cycles/kernel/osl/osl_closures.cpp +++ b/intern/cycles/kernel/osl/osl_closures.cpp @@ -60,6 +60,8 @@ #include "kernel/closure/bsdf_ashikhmin_shirley.h" #include "kernel/closure/bsdf_toon.h" #include "kernel/closure/bsdf_hair.h" +#include "kernel/closure/bsdf_principled_diffuse.h" +#include "kernel/closure/bsdf_principled_sheen.h" #include "kernel/closure/volume.h" CCL_NAMESPACE_BEGIN @@ -154,7 +156,7 @@ BSDF_CLOSURE_CLASS_BEGIN(MicrofacetBeckmannRefraction, microfacet_beckmann_refra BSDF_CLOSURE_CLASS_END(MicrofacetBeckmannRefraction, microfacet_beckmann_refraction) BSDF_CLOSURE_CLASS_BEGIN(HairReflection, hair_reflection, HairBsdf, LABEL_GLOSSY) - CLOSURE_FLOAT3_PARAM(HairReflectionClosure, unused), + CLOSURE_FLOAT3_PARAM(HairReflectionClosure, params.N), CLOSURE_FLOAT_PARAM(HairReflectionClosure, params.roughness1), CLOSURE_FLOAT_PARAM(HairReflectionClosure, params.roughness2), CLOSURE_FLOAT3_PARAM(HairReflectionClosure, params.T), @@ -162,7 +164,7 @@ BSDF_CLOSURE_CLASS_BEGIN(HairReflection, hair_reflection, HairBsdf, LABEL_GLOSSY BSDF_CLOSURE_CLASS_END(HairReflection, hair_reflection) BSDF_CLOSURE_CLASS_BEGIN(HairTransmission, hair_transmission, HairBsdf, LABEL_GLOSSY) - CLOSURE_FLOAT3_PARAM(HairTransmissionClosure, unused), + CLOSURE_FLOAT3_PARAM(HairTransmissionClosure, params.N), CLOSURE_FLOAT_PARAM(HairTransmissionClosure, params.roughness1), CLOSURE_FLOAT_PARAM(HairTransmissionClosure, params.roughness2), CLOSURE_FLOAT3_PARAM(HairReflectionClosure, 
params.T), @@ -176,6 +178,63 @@ VOLUME_CLOSURE_CLASS_END(VolumeHenyeyGreenstein, henyey_greenstein) VOLUME_CLOSURE_CLASS_BEGIN(VolumeAbsorption, absorption, ShaderClosure, LABEL_SINGULAR) VOLUME_CLOSURE_CLASS_END(VolumeAbsorption, absorption) +BSDF_CLOSURE_CLASS_BEGIN(PrincipledDiffuse, principled_diffuse, PrincipledDiffuseBsdf, LABEL_DIFFUSE) + CLOSURE_FLOAT3_PARAM(PrincipledDiffuseClosure, params.N), + CLOSURE_FLOAT_PARAM(PrincipledDiffuseClosure, params.roughness), +BSDF_CLOSURE_CLASS_END(PrincipledDiffuse, principled_diffuse) + +BSDF_CLOSURE_CLASS_BEGIN(PrincipledSheen, principled_sheen, PrincipledSheenBsdf, LABEL_DIFFUSE) + CLOSURE_FLOAT3_PARAM(PrincipledSheenClosure, params.N), +BSDF_CLOSURE_CLASS_END(PrincipledSheen, principled_sheen) + +/* DISNEY PRINCIPLED CLEARCOAT */ +class PrincipledClearcoatClosure : public CBSDFClosure { +public: + MicrofacetBsdf params; + float clearcoat, clearcoat_roughness; + + MicrofacetBsdf *alloc(ShaderData *sd, int path_flag, float3 weight) + { + MicrofacetBsdf *bsdf = (MicrofacetBsdf*)bsdf_alloc_osl(sd, sizeof(MicrofacetBsdf), weight, &params); + MicrofacetExtra *extra = (MicrofacetExtra*)closure_alloc_extra(sd, sizeof(MicrofacetExtra)); + if(bsdf && extra) { + bsdf->extra = extra; + + bsdf->ior = 1.5f; + + bsdf->alpha_x = clearcoat_roughness; + bsdf->alpha_y = clearcoat_roughness; + + bsdf->extra->cspec0 = make_float3(0.04f, 0.04f, 0.04f); + bsdf->extra->clearcoat = clearcoat; + + return bsdf; + } + + return NULL; + } + + void setup(ShaderData *sd, int path_flag, float3 weight) + { + MicrofacetBsdf *bsdf = alloc(sd, path_flag, weight); + sd->flag |= (bsdf) ? 
bsdf_microfacet_ggx_clearcoat_setup(bsdf, sd) : 0; + } +}; + +ClosureParam *closure_bsdf_principled_clearcoat_params() +{ + static ClosureParam params[] = { + CLOSURE_FLOAT3_PARAM(PrincipledClearcoatClosure, params.N), + CLOSURE_FLOAT_PARAM(PrincipledClearcoatClosure, clearcoat), + CLOSURE_FLOAT_PARAM(PrincipledClearcoatClosure, clearcoat_roughness), + CLOSURE_STRING_KEYPARAM(PrincipledClearcoatClosure, label, "label"), + CLOSURE_FINISH_PARAM(PrincipledClearcoatClosure) + }; + return params; +} +CCLOSURE_PREPARE(closure_bsdf_principled_clearcoat_prepare, PrincipledClearcoatClosure) + + /* Registration */ static void register_closure(OSL::ShadingSystem *ss, const char *name, int id, OSL::ClosureParam *params, OSL::PrepareClosureFunc prepare) @@ -215,6 +274,16 @@ void OSLShader::register_closures(OSLShadingSystem *ss_) closure_bsdf_microfacet_multi_ggx_glass_params(), closure_bsdf_microfacet_multi_ggx_glass_prepare); register_closure(ss, "microfacet_multi_ggx_aniso", id++, closure_bsdf_microfacet_multi_ggx_aniso_params(), closure_bsdf_microfacet_multi_ggx_aniso_prepare); + register_closure(ss, "microfacet_ggx_fresnel", id++, + closure_bsdf_microfacet_ggx_fresnel_params(), closure_bsdf_microfacet_ggx_fresnel_prepare); + register_closure(ss, "microfacet_ggx_aniso_fresnel", id++, + closure_bsdf_microfacet_ggx_aniso_fresnel_params(), closure_bsdf_microfacet_ggx_aniso_fresnel_prepare); + register_closure(ss, "microfacet_multi_ggx_fresnel", id++, + closure_bsdf_microfacet_multi_ggx_fresnel_params(), closure_bsdf_microfacet_multi_ggx_fresnel_prepare); + register_closure(ss, "microfacet_multi_ggx_glass_fresnel", id++, + closure_bsdf_microfacet_multi_ggx_glass_fresnel_params(), closure_bsdf_microfacet_multi_ggx_glass_fresnel_prepare); + register_closure(ss, "microfacet_multi_ggx_aniso_fresnel", id++, + closure_bsdf_microfacet_multi_ggx_aniso_fresnel_params(), closure_bsdf_microfacet_multi_ggx_aniso_fresnel_prepare); register_closure(ss, "microfacet_beckmann", id++, 
bsdf_microfacet_beckmann_params(), bsdf_microfacet_beckmann_prepare); register_closure(ss, "microfacet_beckmann_aniso", id++, @@ -229,6 +298,12 @@ void OSLShader::register_closures(OSLShadingSystem *ss_) bsdf_diffuse_toon_params(), bsdf_diffuse_toon_prepare); register_closure(ss, "glossy_toon", id++, bsdf_glossy_toon_params(), bsdf_glossy_toon_prepare); + register_closure(ss, "principled_diffuse", id++, + bsdf_principled_diffuse_params(), bsdf_principled_diffuse_prepare); + register_closure(ss, "principled_sheen", id++, + bsdf_principled_sheen_params(), bsdf_principled_sheen_prepare); + register_closure(ss, "principled_clearcoat", id++, + closure_bsdf_principled_clearcoat_params(), closure_bsdf_principled_clearcoat_prepare); register_closure(ss, "emission", id++, closure_emission_params(), closure_emission_prepare); @@ -248,6 +323,8 @@ void OSLShader::register_closures(OSLShadingSystem *ss_) closure_bssrdf_gaussian_params(), closure_bssrdf_gaussian_prepare); register_closure(ss, "bssrdf_burley", id++, closure_bssrdf_burley_params(), closure_bssrdf_burley_prepare); + register_closure(ss, "bssrdf_principled", id++, + closure_bssrdf_principled_params(), closure_bssrdf_principled_prepare); register_closure(ss, "hair_reflection", id++, bsdf_hair_reflection_params(), bsdf_hair_reflection_prepare); @@ -278,6 +355,86 @@ bool CBSDFClosure::skip(const ShaderData *sd, int path_flag, int scattering) return false; } + +/* GGX closures with Fresnel */ + +class MicrofacetFresnelClosure : public CBSDFClosure { +public: + MicrofacetBsdf params; + float3 color; + float3 cspec0; + + MicrofacetBsdf *alloc(ShaderData *sd, int path_flag, float3 weight) + { + /* Technically, the MultiGGX Glass closure may also transmit. However, + * since this is set statically and only used for caustic flags, this + * is probably as good as it gets. 
*/ + if(!skip(sd, path_flag, LABEL_GLOSSY | LABEL_REFLECT)) { + MicrofacetBsdf *bsdf = (MicrofacetBsdf*)bsdf_alloc_osl(sd, sizeof(MicrofacetBsdf), weight, &params); + MicrofacetExtra *extra = (MicrofacetExtra*)closure_alloc_extra(sd, sizeof(MicrofacetExtra)); + if(bsdf && extra) { + bsdf->extra = extra; + bsdf->extra->color = color; + bsdf->extra->cspec0 = cspec0; + return bsdf; + } + } + + return NULL; + } +}; + +class MicrofacetGGXFresnelClosure : public MicrofacetFresnelClosure { +public: + void setup(ShaderData *sd, int path_flag, float3 weight) + { + MicrofacetBsdf *bsdf = alloc(sd, path_flag, weight); + sd->flag |= (bsdf) ? bsdf_microfacet_ggx_fresnel_setup(bsdf, sd) : 0; + } +}; + +ClosureParam *closure_bsdf_microfacet_ggx_fresnel_params() +{ + static ClosureParam params[] = { + CLOSURE_FLOAT3_PARAM(MicrofacetGGXFresnelClosure, params.N), + CLOSURE_FLOAT_PARAM(MicrofacetGGXFresnelClosure, params.alpha_x), + CLOSURE_FLOAT_PARAM(MicrofacetGGXFresnelClosure, params.ior), + CLOSURE_FLOAT3_PARAM(MicrofacetGGXFresnelClosure, color), + CLOSURE_FLOAT3_PARAM(MicrofacetGGXFresnelClosure, cspec0), + CLOSURE_STRING_KEYPARAM(MicrofacetGGXFresnelClosure, label, "label"), + CLOSURE_FINISH_PARAM(MicrofacetGGXFresnelClosure) + }; + return params; +} +CCLOSURE_PREPARE(closure_bsdf_microfacet_ggx_fresnel_prepare, MicrofacetGGXFresnelClosure); + +class MicrofacetGGXAnisoFresnelClosure : public MicrofacetFresnelClosure { +public: + void setup(ShaderData *sd, int path_flag, float3 weight) + { + MicrofacetBsdf *bsdf = alloc(sd, path_flag, weight); + sd->flag |= (bsdf) ? 
bsdf_microfacet_ggx_aniso_fresnel_setup(bsdf, sd) : 0; + } +}; + +ClosureParam *closure_bsdf_microfacet_ggx_aniso_fresnel_params() +{ + static ClosureParam params[] = { + CLOSURE_FLOAT3_PARAM(MicrofacetGGXFresnelClosure, params.N), + CLOSURE_FLOAT3_PARAM(MicrofacetGGXFresnelClosure, params.T), + CLOSURE_FLOAT_PARAM(MicrofacetGGXFresnelClosure, params.alpha_x), + CLOSURE_FLOAT_PARAM(MicrofacetGGXFresnelClosure, params.alpha_y), + CLOSURE_FLOAT_PARAM(MicrofacetGGXFresnelClosure, params.ior), + CLOSURE_FLOAT3_PARAM(MicrofacetGGXFresnelClosure, color), + CLOSURE_FLOAT3_PARAM(MicrofacetGGXFresnelClosure, cspec0), + CLOSURE_STRING_KEYPARAM(MicrofacetGGXFresnelClosure, label, "label"), + CLOSURE_FINISH_PARAM(MicrofacetGGXFresnelClosure) + }; + return params; +} +CCLOSURE_PREPARE(closure_bsdf_microfacet_ggx_aniso_fresnel_prepare, MicrofacetGGXAnisoFresnelClosure); + + /* Multiscattering GGX closures */ class MicrofacetMultiClosure : public CBSDFClosure { @@ -287,7 +444,7 @@ public: MicrofacetBsdf *alloc(ShaderData *sd, int path_flag, float3 weight) { - /* Technically, the MultiGGX Glass closure may also transmit. However, + /* Technically, the MultiGGX closure may also transmit. However, * since this is set statically and only used for caustic flags, this * is probably as good as it gets. */ if(!skip(sd, path_flag, LABEL_GLOSSY|LABEL_REFLECT)) { @@ -375,5 +532,110 @@ ClosureParam *closure_bsdf_microfacet_multi_ggx_glass_params() } CCLOSURE_PREPARE(closure_bsdf_microfacet_multi_ggx_glass_prepare, MicrofacetMultiGGXGlassClosure); + +/* Multiscattering GGX closures with Fresnel */ + +class MicrofacetMultiFresnelClosure : public CBSDFClosure { +public: + MicrofacetBsdf params; + float3 color; + float3 cspec0; + + MicrofacetBsdf *alloc(ShaderData *sd, int path_flag, float3 weight) + { + /* Technically, the MultiGGX closure may also transmit. However, + * since this is set statically and only used for caustic flags, this + * is probably as good as it gets. 
*/ + if(!skip(sd, path_flag, LABEL_GLOSSY | LABEL_REFLECT)) { + MicrofacetBsdf *bsdf = (MicrofacetBsdf*)bsdf_alloc_osl(sd, sizeof(MicrofacetBsdf), weight, &params); + MicrofacetExtra *extra = (MicrofacetExtra*)closure_alloc_extra(sd, sizeof(MicrofacetExtra)); + if(bsdf && extra) { + bsdf->extra = extra; + bsdf->extra->color = color; + bsdf->extra->cspec0 = cspec0; + return bsdf; + } + } + + return NULL; + } +}; + +class MicrofacetMultiGGXFresnelClosure : public MicrofacetMultiFresnelClosure { +public: + void setup(ShaderData *sd, int path_flag, float3 weight) + { + MicrofacetBsdf *bsdf = alloc(sd, path_flag, weight); + sd->flag |= (bsdf) ? bsdf_microfacet_multi_ggx_fresnel_setup(bsdf, sd) : 0; + } +}; + +ClosureParam *closure_bsdf_microfacet_multi_ggx_fresnel_params() +{ + static ClosureParam params[] = { + CLOSURE_FLOAT3_PARAM(MicrofacetMultiGGXFresnelClosure, params.N), + CLOSURE_FLOAT_PARAM(MicrofacetMultiGGXFresnelClosure, params.alpha_x), + CLOSURE_FLOAT_PARAM(MicrofacetMultiGGXFresnelClosure, params.ior), + CLOSURE_FLOAT3_PARAM(MicrofacetMultiGGXFresnelClosure, color), + CLOSURE_FLOAT3_PARAM(MicrofacetMultiGGXFresnelClosure, cspec0), + CLOSURE_STRING_KEYPARAM(MicrofacetMultiGGXFresnelClosure, label, "label"), + CLOSURE_FINISH_PARAM(MicrofacetMultiGGXFresnelClosure) + }; + return params; +} +CCLOSURE_PREPARE(closure_bsdf_microfacet_multi_ggx_fresnel_prepare, MicrofacetMultiGGXFresnelClosure); + +class MicrofacetMultiGGXAnisoFresnelClosure : public MicrofacetMultiFresnelClosure { +public: + void setup(ShaderData *sd, int path_flag, float3 weight) + { + MicrofacetBsdf *bsdf = alloc(sd, path_flag, weight); + sd->flag |= (bsdf) ? 
bsdf_microfacet_multi_ggx_aniso_fresnel_setup(bsdf, sd) : 0; + } +}; + +ClosureParam *closure_bsdf_microfacet_multi_ggx_aniso_fresnel_params() +{ + static ClosureParam params[] = { + CLOSURE_FLOAT3_PARAM(MicrofacetMultiGGXFresnelClosure, params.N), + CLOSURE_FLOAT3_PARAM(MicrofacetMultiGGXFresnelClosure, params.T), + CLOSURE_FLOAT_PARAM(MicrofacetMultiGGXFresnelClosure, params.alpha_x), + CLOSURE_FLOAT_PARAM(MicrofacetMultiGGXFresnelClosure, params.alpha_y), + CLOSURE_FLOAT_PARAM(MicrofacetMultiGGXFresnelClosure, params.ior), + CLOSURE_FLOAT3_PARAM(MicrofacetMultiGGXFresnelClosure, color), + CLOSURE_FLOAT3_PARAM(MicrofacetMultiGGXFresnelClosure, cspec0), + CLOSURE_STRING_KEYPARAM(MicrofacetMultiGGXFresnelClosure, label, "label"), + CLOSURE_FINISH_PARAM(MicrofacetMultiGGXFresnelClosure) + }; + return params; +} +CCLOSURE_PREPARE(closure_bsdf_microfacet_multi_ggx_aniso_fresnel_prepare, MicrofacetMultiGGXAnisoFresnelClosure); + +class MicrofacetMultiGGXGlassFresnelClosure : public MicrofacetMultiFresnelClosure { +public: + MicrofacetMultiGGXGlassFresnelClosure() : MicrofacetMultiFresnelClosure() {} + + void setup(ShaderData *sd, int path_flag, float3 weight) + { + MicrofacetBsdf *bsdf = alloc(sd, path_flag, weight); + sd->flag |= (bsdf) ? 
bsdf_microfacet_multi_ggx_glass_fresnel_setup(bsdf, sd) : 0; + } +}; + +ClosureParam *closure_bsdf_microfacet_multi_ggx_glass_fresnel_params() +{ + static ClosureParam params[] = { + CLOSURE_FLOAT3_PARAM(MicrofacetMultiGGXFresnelClosure, params.N), + CLOSURE_FLOAT_PARAM(MicrofacetMultiGGXFresnelClosure, params.alpha_x), + CLOSURE_FLOAT_PARAM(MicrofacetMultiGGXFresnelClosure, params.ior), + CLOSURE_FLOAT3_PARAM(MicrofacetMultiGGXFresnelClosure, color), + CLOSURE_FLOAT3_PARAM(MicrofacetMultiGGXFresnelClosure, cspec0), + CLOSURE_STRING_KEYPARAM(MicrofacetMultiGGXFresnelClosure, label, "label"), + CLOSURE_FINISH_PARAM(MicrofacetMultiGGXFresnelClosure) + }; + return params; +} +CCLOSURE_PREPARE(closure_bsdf_microfacet_multi_ggx_glass_fresnel_prepare, MicrofacetMultiGGXGlassFresnelClosure); + CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/osl/osl_closures.h b/intern/cycles/kernel/osl/osl_closures.h index 929cf00a7e6..ff5fd9cc905 100644 --- a/intern/cycles/kernel/osl/osl_closures.h +++ b/intern/cycles/kernel/osl/osl_closures.h @@ -51,10 +51,17 @@ OSL::ClosureParam *closure_bsdf_phong_ramp_params(); OSL::ClosureParam *closure_bssrdf_cubic_params(); OSL::ClosureParam *closure_bssrdf_gaussian_params(); OSL::ClosureParam *closure_bssrdf_burley_params(); +OSL::ClosureParam *closure_bssrdf_principled_params(); OSL::ClosureParam *closure_henyey_greenstein_volume_params(); OSL::ClosureParam *closure_bsdf_microfacet_multi_ggx_params(); OSL::ClosureParam *closure_bsdf_microfacet_multi_ggx_glass_params(); OSL::ClosureParam *closure_bsdf_microfacet_multi_ggx_aniso_params(); +OSL::ClosureParam *closure_bsdf_microfacet_ggx_fresnel_params(); +OSL::ClosureParam *closure_bsdf_microfacet_ggx_aniso_fresnel_params(); +OSL::ClosureParam *closure_bsdf_microfacet_multi_ggx_fresnel_params(); +OSL::ClosureParam *closure_bsdf_microfacet_multi_ggx_glass_fresnel_params(); +OSL::ClosureParam *closure_bsdf_microfacet_multi_ggx_aniso_fresnel_params(); +OSL::ClosureParam 
*closure_bsdf_principled_clearcoat_params(); void closure_emission_prepare(OSL::RendererServices *, int id, void *data); void closure_background_prepare(OSL::RendererServices *, int id, void *data); @@ -65,10 +72,17 @@ void closure_bsdf_phong_ramp_prepare(OSL::RendererServices *, int id, void *data void closure_bssrdf_cubic_prepare(OSL::RendererServices *, int id, void *data); void closure_bssrdf_gaussian_prepare(OSL::RendererServices *, int id, void *data); void closure_bssrdf_burley_prepare(OSL::RendererServices *, int id, void *data); +void closure_bssrdf_principled_prepare(OSL::RendererServices *, int id, void *data); void closure_henyey_greenstein_volume_prepare(OSL::RendererServices *, int id, void *data); void closure_bsdf_microfacet_multi_ggx_prepare(OSL::RendererServices *, int id, void *data); void closure_bsdf_microfacet_multi_ggx_glass_prepare(OSL::RendererServices *, int id, void *data); void closure_bsdf_microfacet_multi_ggx_aniso_prepare(OSL::RendererServices *, int id, void *data); +void closure_bsdf_microfacet_ggx_fresnel_prepare(OSL::RendererServices *, int id, void *data); +void closure_bsdf_microfacet_ggx_aniso_fresnel_prepare(OSL::RendererServices *, int id, void *data); +void closure_bsdf_microfacet_multi_ggx_fresnel_prepare(OSL::RendererServices *, int id, void *data); +void closure_bsdf_microfacet_multi_ggx_glass_fresnel_prepare(OSL::RendererServices *, int id, void *data); +void closure_bsdf_microfacet_multi_ggx_aniso_fresnel_prepare(OSL::RendererServices *, int id, void *data); +void closure_bsdf_principled_clearcoat_prepare(OSL::RendererServices *, int id, void *data); #define CCLOSURE_PREPARE(name, classname) \ void name(RendererServices *, int id, void *data) \ diff --git a/intern/cycles/kernel/osl/osl_services.cpp b/intern/cycles/kernel/osl/osl_services.cpp index b767c60c617..1535496c73d 100644 --- a/intern/cycles/kernel/osl/osl_services.cpp +++ b/intern/cycles/kernel/osl/osl_services.cpp @@ -824,7 +824,7 @@ bool 
OSLRenderServices::get_background_attribute(KernelGlobals *kg, ShaderData * bool OSLRenderServices::get_attribute(OSL::ShaderGlobals *sg, bool derivatives, ustring object_name, TypeDesc type, ustring name, void *val) { - if(sg->renderstate == NULL) + if(sg == NULL || sg->renderstate == NULL) return false; ShaderData *sd = (ShaderData *)(sg->renderstate); diff --git a/intern/cycles/kernel/shaders/CMakeLists.txt b/intern/cycles/kernel/shaders/CMakeLists.txt index b43f8402d42..1a8ed4c884a 100644 --- a/intern/cycles/kernel/shaders/CMakeLists.txt +++ b/intern/cycles/kernel/shaders/CMakeLists.txt @@ -81,13 +81,15 @@ set(SRC_OSL node_wireframe.osl node_hair_bsdf.osl node_uv_map.osl + node_principled_bsdf.osl node_rgb_to_bw.osl ) set(SRC_OSL_HEADERS - node_texture.h node_color.h node_fresnel.h + node_ramp_util.h + node_texture.h stdosl.h oslutil.h ) diff --git a/intern/cycles/kernel/shaders/node_principled_bsdf.osl b/intern/cycles/kernel/shaders/node_principled_bsdf.osl new file mode 100644 index 00000000000..6870d479af3 --- /dev/null +++ b/intern/cycles/kernel/shaders/node_principled_bsdf.osl @@ -0,0 +1,120 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "stdosl.h" +#include "node_fresnel.h" + +shader node_principled_bsdf( + string distribution = "Multiscatter GGX", + color BaseColor = color(0.8, 0.8, 0.8), + float Subsurface = 0.0, + vector SubsurfaceRadius = vector(1.0, 1.0, 1.0), + color SubsurfaceColor = color(0.7, 0.1, 0.1), + float Metallic = 0.0, + float Specular = 0.5, + float SpecularTint = 0.0, + float Roughness = 0.5, + float Anisotropic = 0.0, + float AnisotropicRotation = 0.0, + float Sheen = 0.0, + float SheenTint = 0.5, + float Clearcoat = 0.0, + float ClearcoatRoughness = 0.03, + float IOR = 1.45, + float Transmission = 0.0, + float TransmissionRoughness = 0.0, + normal Normal = N, + normal ClearcoatNormal = N, + normal Tangent = normalize(dPdu), + output closure color BSDF = 0) +{ + float f = max(IOR, 1e-5); + float diffuse_weight = (1.0 - clamp(Metallic, 0.0, 1.0)) * (1.0 - clamp(Transmission, 0.0, 1.0)); + float final_transmission = clamp(Transmission, 0.0, 1.0) * (1.0 - clamp(Metallic, 0.0, 1.0)); + float specular_weight = (1.0 - final_transmission); + + vector T = Tangent; + + float m_cdlum = luminance(BaseColor); + color m_ctint = m_cdlum > 0.0 ? BaseColor / m_cdlum : color(0.0, 0.0, 0.0); // normalize lum. 
to isolate hue+sat + + /* rotate tangent */ + if (AnisotropicRotation != 0.0) + T = rotate(T, AnisotropicRotation * M_2PI, point(0.0, 0.0, 0.0), Normal); + + if (diffuse_weight > 1e-5) { + if (Subsurface > 1e-5) { + color mixed_ss_base_color = SubsurfaceColor * Subsurface + BaseColor * (1.0 - Subsurface); + BSDF = mixed_ss_base_color * bssrdf_principled(Normal, Subsurface * SubsurfaceRadius, 0.0, SubsurfaceColor, Roughness); + } else { + BSDF = BaseColor * principled_diffuse(Normal, Roughness); + } + + if (Sheen > 1e-5) { + color sheen_color = color(1.0, 1.0, 1.0) * (1.0 - SheenTint) + m_ctint * SheenTint; + + BSDF = BSDF + sheen_color * Sheen * principled_sheen(Normal); + } + + BSDF = BSDF * diffuse_weight; + } + + if (specular_weight > 1e-5) { + float aspect = sqrt(1.0 - Anisotropic * 0.9); + float r2 = Roughness * Roughness; + + float alpha_x = r2 / aspect; + float alpha_y = r2 * aspect; + + color tmp_col = color(1.0, 1.0, 1.0) * (1.0 - SpecularTint) + m_ctint * SpecularTint; + + color Cspec0 = (Specular * 0.08 * tmp_col) * (1.0 - Metallic) + BaseColor * Metallic; + + if (distribution == "GGX" || Roughness <= 0.075) { + BSDF = BSDF + specular_weight * microfacet_ggx_aniso_fresnel(Normal, T, alpha_x, alpha_y, (2.0 / (1.0 - sqrt(0.08 * Specular))) - 1.0, BaseColor, Cspec0); + } else { + BSDF = BSDF + specular_weight * microfacet_multi_ggx_aniso_fresnel(Normal, T, alpha_x, alpha_y, (2.0 / (1.0 - sqrt(0.08 * Specular))) - 1.0, BaseColor, Cspec0); + } + } + + if (final_transmission > 1e-5) { + color Cspec0 = BaseColor * SpecularTint + color(1.0, 1.0, 1.0) * (1.0 - SpecularTint); + float eta = backfacing() ? 
1.0 / f : f; + + if (distribution == "GGX" || Roughness <= 5e-2) { + float cosNO = dot(Normal, I); + float Fr = fresnel_dielectric_cos(cosNO, eta); + + float refl_roughness = Roughness; + if (Roughness <= 1e-2) + refl_roughness = 0.0; + + float transmission_roughness = refl_roughness; + if (distribution == "GGX") + transmission_roughness = 1.0 - (1.0 - refl_roughness) * (1.0 - TransmissionRoughness); + + BSDF = BSDF + final_transmission * (Fr * microfacet_ggx_fresnel(Normal, refl_roughness * refl_roughness, eta, BaseColor, Cspec0) + + (1.0 - Fr) * BaseColor * microfacet_ggx_refraction(Normal, transmission_roughness * transmission_roughness, eta)); + } else { + BSDF = BSDF + final_transmission * microfacet_multi_ggx_glass_fresnel(Normal, Roughness * Roughness, eta, BaseColor, Cspec0); + } + } + + if (Clearcoat > 1e-5) { + BSDF = BSDF + principled_clearcoat(ClearcoatNormal, Clearcoat, ClearcoatRoughness * ClearcoatRoughness); + } +} + diff --git a/intern/cycles/kernel/shaders/stdosl.h b/intern/cycles/kernel/shaders/stdosl.h index a8dda8a12c9..c91d2918687 100644 --- a/intern/cycles/kernel/shaders/stdosl.h +++ b/intern/cycles/kernel/shaders/stdosl.h @@ -530,6 +530,11 @@ closure color microfacet_ggx_refraction(normal N, float ag, float eta) BUILTIN; closure color microfacet_multi_ggx(normal N, float ag, color C) BUILTIN; closure color microfacet_multi_ggx_aniso(normal N, vector T, float ax, float ay, color C) BUILTIN; closure color microfacet_multi_ggx_glass(normal N, float ag, float eta, color C) BUILTIN; +closure color microfacet_ggx_fresnel(normal N, float ag, float eta, color C, color Cspec0) BUILTIN; +closure color microfacet_ggx_aniso_fresnel(normal N, vector T, float ax, float ay, float eta, color C, color Cspec0) BUILTIN; +closure color microfacet_multi_ggx_fresnel(normal N, float ag, float eta, color C, color Cspec0) BUILTIN; +closure color microfacet_multi_ggx_aniso_fresnel(normal N, vector T, float ax, float ay, float eta, color C, color Cspec0) BUILTIN; 
+closure color microfacet_multi_ggx_glass_fresnel(normal N, float ag, float eta, color C, color Cspec0) BUILTIN; closure color microfacet_beckmann(normal N, float ab) BUILTIN; closure color microfacet_beckmann_aniso(normal N, vector T, float ax, float ay) BUILTIN; closure color microfacet_beckmann_refraction(normal N, float ab, float eta) BUILTIN; @@ -539,11 +544,15 @@ closure color emission() BUILTIN; closure color background() BUILTIN; closure color holdout() BUILTIN; closure color ambient_occlusion() BUILTIN; +closure color principled_diffuse(normal N, float roughness) BUILTIN; +closure color principled_sheen(normal N) BUILTIN; +closure color principled_clearcoat(normal N, float clearcoat, float clearcoat_roughness) BUILTIN; // BSSRDF closure color bssrdf_cubic(normal N, vector radius, float texture_blur, float sharpness) BUILTIN; closure color bssrdf_gaussian(normal N, vector radius, float texture_blur) BUILTIN; closure color bssrdf_burley(normal N, vector radius, float texture_blur, color albedo) BUILTIN; +closure color bssrdf_principled(normal N, vector radius, float texture_blur, color subsurface_color, float roughness) BUILTIN; // Hair closure color hair_reflection(normal N, float roughnessu, float roughnessv, vector T, float offset) BUILTIN; diff --git a/intern/cycles/kernel/split/kernel_branched.h b/intern/cycles/kernel/split/kernel_branched.h new file mode 100644 index 00000000000..e2762a85fc8 --- /dev/null +++ b/intern/cycles/kernel/split/kernel_branched.h @@ -0,0 +1,220 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +CCL_NAMESPACE_BEGIN + +#ifdef __BRANCHED_PATH__ + +/* sets up the various state needed to do an indirect loop */ +ccl_device_inline void kernel_split_branched_path_indirect_loop_init(KernelGlobals *kg, int ray_index) +{ + SplitBranchedState *branched_state = &kernel_split_state.branched_state[ray_index]; + + /* save a copy of the state to restore later */ +#define BRANCHED_STORE(name) \ + branched_state->name = kernel_split_state.name[ray_index]; + + BRANCHED_STORE(path_state); + BRANCHED_STORE(throughput); + BRANCHED_STORE(ray); + BRANCHED_STORE(sd); + BRANCHED_STORE(isect); + BRANCHED_STORE(ray_state); + +#undef BRANCHED_STORE + + /* set loop counters to initial position */ + branched_state->next_closure = 0; + branched_state->next_sample = 0; +} + +/* ends an indirect loop and restores the previous state */ +ccl_device_inline void kernel_split_branched_path_indirect_loop_end(KernelGlobals *kg, int ray_index) +{ + SplitBranchedState *branched_state = &kernel_split_state.branched_state[ray_index]; + + /* restore state */ +#define BRANCHED_RESTORE(name) \ + kernel_split_state.name[ray_index] = branched_state->name; + + BRANCHED_RESTORE(path_state); + BRANCHED_RESTORE(throughput); + BRANCHED_RESTORE(ray); + BRANCHED_RESTORE(sd); + BRANCHED_RESTORE(isect); + BRANCHED_RESTORE(ray_state); + +#undef BRANCHED_RESTORE + + /* leave indirect loop */ + REMOVE_RAY_FLAG(kernel_split_state.ray_state, ray_index, RAY_BRANCHED_INDIRECT); +} + +ccl_device_inline bool kernel_split_branched_indirect_start_shared(KernelGlobals *kg, int ray_index) +{ + 
ccl_global char *ray_state = kernel_split_state.ray_state; + + int inactive_ray = dequeue_ray_index(QUEUE_INACTIVE_RAYS, + kernel_split_state.queue_data, kernel_split_params.queue_size, kernel_split_params.queue_index); + + if(!IS_STATE(ray_state, inactive_ray, RAY_INACTIVE)) { + return false; + } + +#define SPLIT_DATA_ENTRY(type, name, num) \ + kernel_split_state.name[inactive_ray] = kernel_split_state.name[ray_index]; + SPLIT_DATA_ENTRIES_BRANCHED_SHARED +#undef SPLIT_DATA_ENTRY + + kernel_split_state.branched_state[inactive_ray].shared_sample_count = 0; + kernel_split_state.branched_state[inactive_ray].original_ray = ray_index; + kernel_split_state.branched_state[inactive_ray].waiting_on_shared_samples = false; + + PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; + PathRadiance *inactive_L = &kernel_split_state.path_radiance[inactive_ray]; + + path_radiance_init(inactive_L, kernel_data.film.use_light_pass); + inactive_L->direct_throughput = L->direct_throughput; + path_radiance_copy_indirect(inactive_L, L); + + ray_state[inactive_ray] = RAY_REGENERATED; + ADD_RAY_FLAG(ray_state, inactive_ray, RAY_BRANCHED_INDIRECT_SHARED); + ADD_RAY_FLAG(ray_state, inactive_ray, IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT)); + + atomic_fetch_and_inc_uint32((ccl_global uint*)&kernel_split_state.branched_state[ray_index].shared_sample_count); + + return true; +} + +/* bounce off surface and integrate indirect light */ +ccl_device_noinline bool kernel_split_branched_path_surface_indirect_light_iter(KernelGlobals *kg, + int ray_index, + float num_samples_adjust, + ShaderData *saved_sd, + bool reset_path_state, + bool wait_for_shared) +{ + SplitBranchedState *branched_state = &kernel_split_state.branched_state[ray_index]; + + ShaderData *sd = saved_sd; + RNG rng = kernel_split_state.rng[ray_index]; + PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; + float3 throughput = branched_state->throughput; + ccl_global PathState *ps = 
&kernel_split_state.path_state[ray_index]; + + float sum_sample_weight = 0.0f; +#ifdef __DENOISING_FEATURES__ + if(ps->denoising_feature_weight > 0.0f) { + for(int i = 0; i < sd->num_closure; i++) { + const ShaderClosure *sc = &sd->closure[i]; + + /* transparency is not handled here, but in outer loop */ + if(!CLOSURE_IS_BSDF(sc->type) || CLOSURE_IS_BSDF_TRANSPARENT(sc->type)) { + continue; + } + + sum_sample_weight += sc->sample_weight; + } + } + else { + sum_sample_weight = 1.0f; + } +#endif /* __DENOISING_FEATURES__ */ + + for(int i = branched_state->next_closure; i < sd->num_closure; i++) { + const ShaderClosure *sc = &sd->closure[i]; + + if(!CLOSURE_IS_BSDF(sc->type)) + continue; + /* transparency is not handled here, but in outer loop */ + if(sc->type == CLOSURE_BSDF_TRANSPARENT_ID) + continue; + + int num_samples; + + if(CLOSURE_IS_BSDF_DIFFUSE(sc->type)) + num_samples = kernel_data.integrator.diffuse_samples; + else if(CLOSURE_IS_BSDF_BSSRDF(sc->type)) + num_samples = 1; + else if(CLOSURE_IS_BSDF_GLOSSY(sc->type)) + num_samples = kernel_data.integrator.glossy_samples; + else + num_samples = kernel_data.integrator.transmission_samples; + + num_samples = ceil_to_int(num_samples_adjust*num_samples); + + float num_samples_inv = num_samples_adjust/num_samples; + RNG bsdf_rng = cmj_hash(rng, i); + + for(int j = branched_state->next_sample; j < num_samples; j++) { + if(reset_path_state) { + *ps = branched_state->path_state; + } + + ccl_global float3 *tp = &kernel_split_state.throughput[ray_index]; + *tp = throughput; + + ccl_global Ray *bsdf_ray = &kernel_split_state.ray[ray_index]; + + if(!kernel_branched_path_surface_bounce(kg, + &bsdf_rng, + sd, + sc, + j, + num_samples, + tp, + ps, + L, + bsdf_ray, + sum_sample_weight)) + { + continue; + } + + /* update state for next iteration */ + branched_state->next_closure = i; + branched_state->next_sample = j+1; + branched_state->num_samples = num_samples; + + /* start the indirect path */ + *tp *= num_samples_inv; + + 
if(kernel_split_branched_indirect_start_shared(kg, ray_index)) { + continue; + } + + return true; + } + + branched_state->next_sample = 0; + } + + branched_state->next_closure = sd->num_closure; + + if(wait_for_shared) { + branched_state->waiting_on_shared_samples = (branched_state->shared_sample_count > 0); + if(branched_state->waiting_on_shared_samples) { + return true; + } + } + + return false; +} + +#endif /* __BRANCHED_PATH__ */ + +CCL_NAMESPACE_END + diff --git a/intern/cycles/kernel/split/kernel_buffer_update.h b/intern/cycles/kernel/split/kernel_buffer_update.h index 859c221d976..4c1fdd2d69c 100644 --- a/intern/cycles/kernel/split/kernel_buffer_update.h +++ b/intern/cycles/kernel/split/kernel_buffer_update.h @@ -111,24 +111,14 @@ ccl_device void kernel_buffer_update(KernelGlobals *kg, buffer += (kernel_split_params.offset + pixel_x + pixel_y*stride) * kernel_data.film.pass_stride; if(IS_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER)) { - float3 L_sum; -#ifdef __SHADOW_TRICKS__ - if(state->flag & PATH_RAY_SHADOW_CATCHER) { - L_sum = path_radiance_sum_shadowcatcher(kg, L, L_transparent); - } - else -#endif /* __SHADOW_TRICKS__ */ - { - L_sum = path_radiance_clamp_and_sum(kg, L); - } - kernel_write_light_passes(kg, buffer, L, sample); #ifdef __KERNEL_DEBUG__ kernel_write_debug_passes(kg, buffer, state, debug_data, sample); #endif - float4 L_rad = make_float4(L_sum.x, L_sum.y, L_sum.z, 1.0f - (*L_transparent)); /* accumulate result in output buffer */ - kernel_write_pass_float4(buffer, sample, L_rad); + bool is_shadow_catcher = (state->flag & PATH_RAY_SHADOW_CATCHER); + kernel_write_result(kg, buffer, sample, L, 1.0f - (*L_transparent), is_shadow_catcher); + path_rng_end(kg, rng_state, rng); ASSIGN_RAY_STATE(ray_state, ray_index, RAY_TO_REGENERATE); diff --git a/intern/cycles/kernel/split/kernel_data_init.h b/intern/cycles/kernel/split/kernel_data_init.h index 9d3d01fff75..e4545d66eff 100644 --- a/intern/cycles/kernel/split/kernel_data_init.h +++ 
b/intern/cycles/kernel/split/kernel_data_init.h @@ -67,6 +67,10 @@ void KERNEL_FUNCTION_FULL_NAME(data_init)( unsigned int num_samples, ccl_global float *buffer) { +#ifdef KERNEL_STUB + STUB_ASSERT(KERNEL_ARCH, data_init); +#else + #ifdef __KERNEL_OPENCL__ kg->data = data; #endif @@ -105,21 +109,16 @@ void KERNEL_FUNCTION_FULL_NAME(data_init)( /* Initialize queue data and queue index. */ if(thread_index < queuesize) { - /* Initialize active ray queue. */ - kernel_split_state.queue_data[QUEUE_ACTIVE_AND_REGENERATED_RAYS * queuesize + thread_index] = QUEUE_EMPTY_SLOT; - /* Initialize background and buffer update queue. */ - kernel_split_state.queue_data[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS * queuesize + thread_index] = QUEUE_EMPTY_SLOT; - /* Initialize shadow ray cast of AO queue. */ - kernel_split_state.queue_data[QUEUE_SHADOW_RAY_CAST_AO_RAYS * queuesize + thread_index] = QUEUE_EMPTY_SLOT; - /* Initialize shadow ray cast of direct lighting queue. */ - kernel_split_state.queue_data[QUEUE_SHADOW_RAY_CAST_DL_RAYS * queuesize + thread_index] = QUEUE_EMPTY_SLOT; + for(int i = 0; i < NUM_QUEUES; i++) { + kernel_split_state.queue_data[i * queuesize + thread_index] = QUEUE_EMPTY_SLOT; + } } if(thread_index == 0) { - Queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS] = 0; - Queue_index[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS] = 0; - Queue_index[QUEUE_SHADOW_RAY_CAST_AO_RAYS] = 0; - Queue_index[QUEUE_SHADOW_RAY_CAST_DL_RAYS] = 0; + for(int i = 0; i < NUM_QUEUES; i++) { + Queue_index[i] = 0; + } + /* The scene-intersect kernel should not use the queues very first time. * since the queue would be empty. 
*/ @@ -148,6 +147,8 @@ void KERNEL_FUNCTION_FULL_NAME(data_init)( *(rng_state + index) = hash_int_2d(x, y); } } + +#endif /* KERNEL_STUB */ } CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/split/kernel_direct_lighting.h b/intern/cycles/kernel/split/kernel_direct_lighting.h index bdbf7387b95..3336c968a44 100644 --- a/intern/cycles/kernel/split/kernel_direct_lighting.h +++ b/intern/cycles/kernel/split/kernel_direct_lighting.h @@ -56,23 +56,6 @@ ccl_device void kernel_direct_lighting(KernelGlobals *kg, kernel_split_params.queue_size, 0); -#ifdef __COMPUTE_DEVICE_GPU__ - /* If we are executing on a GPU device, we exit all threads that are not - * required. - * - * If we are executing on a CPU device, then we need to keep all threads - * active since we have barrier() calls later in the kernel. CPU devices, - * expect all threads to execute barrier statement. - */ - if(ray_index == QUEUE_EMPTY_SLOT) { - return; - } -#endif - -#ifndef __COMPUTE_DEVICE_GPU__ - if(ray_index != QUEUE_EMPTY_SLOT) { -#endif - if(IS_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE)) { ccl_global PathState *state = &kernel_split_state.path_state[ray_index]; ShaderData *sd = &kernel_split_state.sd[ray_index]; @@ -80,25 +63,24 @@ ccl_device void kernel_direct_lighting(KernelGlobals *kg, /* direct lighting */ #ifdef __EMISSION__ RNG rng = kernel_split_state.rng[ray_index]; + bool flag = (kernel_data.integrator.use_direct_light && (sd->flag & SD_BSDF_HAS_EVAL)); + +# ifdef __BRANCHED_PATH__ + if(flag && kernel_data.integrator.branched) { + flag = false; + enqueue_flag = 1; + } +# endif /* __BRANCHED_PATH__ */ + # ifdef __SHADOW_TRICKS__ if(flag && state->flag & PATH_RAY_SHADOW_CATCHER) { flag = false; - ShaderData *emission_sd = &kernel_split_state.sd_DL_shadow[ray_index]; - float3 throughput = kernel_split_state.throughput[ray_index]; - PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; - kernel_branched_path_surface_connect_light(kg, - &rng, - sd, - emission_sd, - 
state, - throughput, - 1.0f, - L, - 1); + enqueue_flag = 1; } # endif /* __SHADOW_TRICKS__ */ + if(flag) { /* Sample illumination from lights to find path contribution. */ float light_t = path_state_rng_1D(kg, &rng, state, PRNG_LIGHT); @@ -129,7 +111,6 @@ ccl_device void kernel_direct_lighting(KernelGlobals *kg, kernel_split_state.bsdf_eval[ray_index] = L_light; kernel_split_state.is_lamp[ray_index] = is_lamp; /* Mark ray state for next shadow kernel. */ - ADD_RAY_FLAG(kernel_split_state.ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL); enqueue_flag = 1; } } @@ -138,10 +119,6 @@ ccl_device void kernel_direct_lighting(KernelGlobals *kg, #endif /* __EMISSION__ */ } -#ifndef __COMPUTE_DEVICE_GPU__ - } -#endif - #ifdef __EMISSION__ /* Enqueue RAY_SHADOW_RAY_CAST_DL rays. */ enqueue_ray_index_local(ray_index, @@ -152,6 +129,27 @@ ccl_device void kernel_direct_lighting(KernelGlobals *kg, kernel_split_state.queue_data, kernel_split_params.queue_index); #endif + +#ifdef __BRANCHED_PATH__ + /* Enqueue RAY_LIGHT_INDIRECT_NEXT_ITER rays + * this is the last kernel before next_iteration_setup that uses local atomics so we do this here + */ + ccl_barrier(CCL_LOCAL_MEM_FENCE); + if(ccl_local_id(0) == 0 && ccl_local_id(1) == 0) { + *local_queue_atomics = 0; + } + ccl_barrier(CCL_LOCAL_MEM_FENCE); + + ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); + enqueue_ray_index_local(ray_index, + QUEUE_LIGHT_INDIRECT_ITER, + IS_STATE(kernel_split_state.ray_state, ray_index, RAY_LIGHT_INDIRECT_NEXT_ITER), + kernel_split_params.queue_size, + local_queue_atomics, + kernel_split_state.queue_data, + kernel_split_params.queue_index); + +#endif /* __BRANCHED_PATH__ */ } CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/split/kernel_do_volume.h b/intern/cycles/kernel/split/kernel_do_volume.h index 47d3c280831..9f8dd2392d9 100644 --- a/intern/cycles/kernel/split/kernel_do_volume.h +++ b/intern/cycles/kernel/split/kernel_do_volume.h @@ -16,6 +16,100 @@ CCL_NAMESPACE_BEGIN +#if 
defined(__BRANCHED_PATH__) && defined(__VOLUME__) + +ccl_device_inline void kernel_split_branched_path_volume_indirect_light_init(KernelGlobals *kg, int ray_index) +{ + kernel_split_branched_path_indirect_loop_init(kg, ray_index); + + ADD_RAY_FLAG(kernel_split_state.ray_state, ray_index, RAY_BRANCHED_VOLUME_INDIRECT); +} + +ccl_device_noinline bool kernel_split_branched_path_volume_indirect_light_iter(KernelGlobals *kg, int ray_index) +{ + SplitBranchedState *branched_state = &kernel_split_state.branched_state[ray_index]; + + ShaderData *sd = &kernel_split_state.sd[ray_index]; + RNG rng = kernel_split_state.rng[ray_index]; + PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; + ShaderData *emission_sd = &kernel_split_state.sd_DL_shadow[ray_index]; + + /* GPU: no decoupled ray marching, scatter probabilistically */ + int num_samples = kernel_data.integrator.volume_samples; + float num_samples_inv = 1.0f/num_samples; + + Ray volume_ray = branched_state->ray; + volume_ray.t = (!IS_STATE(&branched_state->ray_state, 0, RAY_HIT_BACKGROUND)) ? 
branched_state->isect.t : FLT_MAX; + + bool heterogeneous = volume_stack_is_heterogeneous(kg, branched_state->path_state.volume_stack); + + for(int j = branched_state->next_sample; j < num_samples; j++) { + ccl_global PathState *ps = &kernel_split_state.path_state[ray_index]; + *ps = branched_state->path_state; + + ccl_global Ray *pray = &kernel_split_state.ray[ray_index]; + *pray = branched_state->ray; + + ccl_global float3 *tp = &kernel_split_state.throughput[ray_index]; + *tp = branched_state->throughput * num_samples_inv; + + /* branch RNG state */ + path_state_branch(ps, j, num_samples); + + /* integrate along volume segment with distance sampling */ + VolumeIntegrateResult result = kernel_volume_integrate( + kg, ps, sd, &volume_ray, L, tp, &rng, heterogeneous); + +# ifdef __VOLUME_SCATTER__ + if(result == VOLUME_PATH_SCATTERED) { + /* direct lighting */ + kernel_path_volume_connect_light(kg, &rng, sd, emission_sd, *tp, &branched_state->path_state, L); + + /* indirect light bounce */ + if(!kernel_path_volume_bounce(kg, &rng, sd, tp, ps, L, pray)) { + continue; + } + + /* start the indirect path */ + branched_state->next_closure = 0; + branched_state->next_sample = j+1; + branched_state->num_samples = num_samples; + + /* Attempting to share too many samples is slow for volumes as it causes us to + * loop here more and have many calls to kernel_volume_integrate which evaluates + * shaders. The many expensive shader evaluations cause the work load to become + * unbalanced and many threads to become idle in this kernel. Limiting the + * number of shared samples here helps quite a lot. 
+ */ + if(branched_state->shared_sample_count < 2) { + if(kernel_split_branched_indirect_start_shared(kg, ray_index)) { + continue; + } + } + + return true; + } +# endif + } + + branched_state->next_sample = num_samples; + + branched_state->waiting_on_shared_samples = (branched_state->shared_sample_count > 0); + if(branched_state->waiting_on_shared_samples) { + return true; + } + + kernel_split_branched_path_indirect_loop_end(kg, ray_index); + + /* todo: avoid this calculation using decoupled ray marching */ + float3 throughput = kernel_split_state.throughput[ray_index]; + kernel_volume_shadow(kg, emission_sd, &kernel_split_state.path_state[ray_index], &volume_ray, &throughput); + kernel_split_state.throughput[ray_index] = throughput; + + return false; +} + +#endif /* __BRANCHED_PATH__ && __VOLUME__ */ ccl_device void kernel_do_volume(KernelGlobals *kg) { @@ -23,37 +117,36 @@ ccl_device void kernel_do_volume(KernelGlobals *kg) /* We will empty this queue in this kernel. */ if(ccl_global_id(0) == 0 && ccl_global_id(1) == 0) { kernel_split_params.queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS] = 0; +# ifdef __BRANCHED_PATH__ + kernel_split_params.queue_index[QUEUE_VOLUME_INDIRECT_ITER] = 0; +# endif /* __BRANCHED_PATH__ */ } - /* Fetch use_queues_flag. */ - char local_use_queues_flag = *kernel_split_params.use_queues_flag; - ccl_barrier(CCL_LOCAL_MEM_FENCE); int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); - if(local_use_queues_flag) { + + if(*kernel_split_params.use_queues_flag) { ray_index = get_ray_index(kg, ray_index, QUEUE_ACTIVE_AND_REGENERATED_RAYS, kernel_split_state.queue_data, kernel_split_params.queue_size, 1); - if(ray_index == QUEUE_EMPTY_SLOT) { - return; - } } - if(IS_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE) || - IS_STATE(kernel_split_state.ray_state, ray_index, RAY_HIT_BACKGROUND)) { + ccl_global char *ray_state = kernel_split_state.ray_state; - bool hit = ! 
IS_STATE(kernel_split_state.ray_state, ray_index, RAY_HIT_BACKGROUND); - - PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; - ccl_global PathState *state = &kernel_split_state.path_state[ray_index]; + PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; + ccl_global PathState *state = &kernel_split_state.path_state[ray_index]; + if(IS_STATE(ray_state, ray_index, RAY_ACTIVE) || + IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND)) { ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index]; ccl_global Ray *ray = &kernel_split_state.ray[ray_index]; RNG rng = kernel_split_state.rng[ray_index]; ccl_global Intersection *isect = &kernel_split_state.isect[ray_index]; ShaderData *sd = &kernel_split_state.sd[ray_index]; - ShaderData *sd_input = &kernel_split_state.sd_DL_shadow[ray_index]; + ShaderData *emission_sd = &kernel_split_state.sd_DL_shadow[ray_index]; + + bool hit = ! IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND); /* Sanitize volume stack. */ if(!hit) { @@ -64,31 +157,68 @@ ccl_device void kernel_do_volume(KernelGlobals *kg) Ray volume_ray = *ray; volume_ray.t = (hit)? 
isect->t: FLT_MAX; - bool heterogeneous = volume_stack_is_heterogeneous(kg, state->volume_stack); +# ifdef __BRANCHED_PATH__ + if(!kernel_data.integrator.branched || IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT)) { +# endif /* __BRANCHED_PATH__ */ + bool heterogeneous = volume_stack_is_heterogeneous(kg, state->volume_stack); - { - /* integrate along volume segment with distance sampling */ - VolumeIntegrateResult result = kernel_volume_integrate( - kg, state, sd, &volume_ray, L, throughput, &rng, heterogeneous); + { + /* integrate along volume segment with distance sampling */ + VolumeIntegrateResult result = kernel_volume_integrate( + kg, state, sd, &volume_ray, L, throughput, &rng, heterogeneous); # ifdef __VOLUME_SCATTER__ - if(result == VOLUME_PATH_SCATTERED) { - /* direct lighting */ - kernel_path_volume_connect_light(kg, &rng, sd, sd_input, *throughput, state, L); - - /* indirect light bounce */ - if(kernel_path_volume_bounce(kg, &rng, sd, throughput, state, L, ray)) - ASSIGN_RAY_STATE(kernel_split_state.ray_state, ray_index, RAY_REGENERATED); - else - ASSIGN_RAY_STATE(kernel_split_state.ray_state, ray_index, RAY_UPDATE_BUFFER); + if(result == VOLUME_PATH_SCATTERED) { + /* direct lighting */ + kernel_path_volume_connect_light(kg, &rng, sd, emission_sd, *throughput, state, L); + + /* indirect light bounce */ + if(kernel_path_volume_bounce(kg, &rng, sd, throughput, state, L, ray)) { + ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED); + } + else { + kernel_split_path_end(kg, ray_index); + } + } +# endif /* __VOLUME_SCATTER__ */ } -# endif + +# ifdef __BRANCHED_PATH__ } + else { + kernel_split_branched_path_volume_indirect_light_init(kg, ray_index); + + if(kernel_split_branched_path_volume_indirect_light_iter(kg, ray_index)) { + ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED); + } + } +# endif /* __BRANCHED_PATH__ */ } + kernel_split_state.rng[ray_index] = rng; } -#endif +# ifdef __BRANCHED_PATH__ + /* iter loop */ + ray_index = 
get_ray_index(kg, ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0), + QUEUE_VOLUME_INDIRECT_ITER, + kernel_split_state.queue_data, + kernel_split_params.queue_size, + 1); + + if(IS_STATE(ray_state, ray_index, RAY_VOLUME_INDIRECT_NEXT_ITER)) { + /* for render passes, sum and reset indirect light pass variables + * for the next samples */ + path_radiance_sum_indirect(&kernel_split_state.path_radiance[ray_index]); + path_radiance_reset_indirect(&kernel_split_state.path_radiance[ray_index]); + + if(kernel_split_branched_path_volume_indirect_light_iter(kg, ray_index)) { + ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED); + } + } +# endif /* __BRANCHED_PATH__ */ + +#endif /* __VOLUME__ */ } diff --git a/intern/cycles/kernel/split/kernel_enqueue_inactive.h b/intern/cycles/kernel/split/kernel_enqueue_inactive.h new file mode 100644 index 00000000000..496355bbc3a --- /dev/null +++ b/intern/cycles/kernel/split/kernel_enqueue_inactive.h @@ -0,0 +1,46 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +CCL_NAMESPACE_BEGIN + +ccl_device void kernel_enqueue_inactive(KernelGlobals *kg, + ccl_local_param unsigned int *local_queue_atomics) +{ +#ifdef __BRANCHED_PATH__ + /* Enqueue RAY_INACTIVE rays into QUEUE_INACTIVE_RAYS queue. 
*/ + if(ccl_local_id(0) == 0 && ccl_local_id(1) == 0) { + *local_queue_atomics = 0; + } + ccl_barrier(CCL_LOCAL_MEM_FENCE); + + int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); + + char enqueue_flag = 0; + if(IS_STATE(kernel_split_state.ray_state, ray_index, RAY_INACTIVE)) { + enqueue_flag = 1; + } + + enqueue_ray_index_local(ray_index, + QUEUE_INACTIVE_RAYS, + enqueue_flag, + kernel_split_params.queue_size, + local_queue_atomics, + kernel_split_state.queue_data, + kernel_split_params.queue_index); +#endif /* __BRANCHED_PATH__ */ +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h b/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h index 9fc853a84bf..fec671be016 100644 --- a/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h +++ b/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h @@ -52,6 +52,7 @@ CCL_NAMESPACE_BEGIN * - QUEUE_SHADOW_RAY_CAST_AO_RAYS will be filled with rays marked with * flag RAY_SHADOW_RAY_CAST_AO */ + ccl_device void kernel_holdout_emission_blurring_pathtermination_ao( KernelGlobals *kg, ccl_local_param BackgroundAOLocals *locals) @@ -62,8 +63,9 @@ ccl_device void kernel_holdout_emission_blurring_pathtermination_ao( } ccl_barrier(CCL_LOCAL_MEM_FENCE); +#ifdef __AO__ char enqueue_flag = 0; - char enqueue_flag_AO_SHADOW_RAY_CAST = 0; +#endif int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); ray_index = get_ray_index(kg, ray_index, QUEUE_ACTIVE_AND_REGENERATED_RAYS, @@ -122,14 +124,22 @@ ccl_device void kernel_holdout_emission_blurring_pathtermination_ao( #ifdef __SHADOW_TRICKS__ if((sd->object_flag & SD_OBJECT_SHADOW_CATCHER)) { - if (state->flag & PATH_RAY_CAMERA) { - state->flag |= (PATH_RAY_SHADOW_CATCHER | PATH_RAY_SHADOW_CATCHER_ONLY); + if(state->flag & PATH_RAY_CAMERA) { + PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; + 
state->flag |= (PATH_RAY_SHADOW_CATCHER | + PATH_RAY_SHADOW_CATCHER_ONLY | + PATH_RAY_STORE_SHADOW_INFO); state->catcher_object = sd->object; if(!kernel_data.background.transparent) { - PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; ccl_global Ray *ray = &kernel_split_state.ray[ray_index]; - L->shadow_color = indirect_background(kg, &kernel_split_state.sd_DL_shadow[ray_index], state, ray); + L->shadow_background_color = indirect_background( + kg, + &kernel_split_state.sd_DL_shadow[ray_index], + state, + ray); } + L->shadow_radiance_sum = path_radiance_clamp_and_sum(kg, L); + L->shadow_throughput = average(throughput); } } else { @@ -155,8 +165,7 @@ ccl_device void kernel_holdout_emission_blurring_pathtermination_ao( kernel_split_state.L_transparent[ray_index] += average(holdout_weight*throughput); } if(sd->object_flag & SD_OBJECT_HOLDOUT_MASK) { - ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER); - enqueue_flag = 1; + kernel_split_path_end(kg, ray_index); } } #endif /* __HOLDOUT__ */ @@ -164,18 +173,31 @@ ccl_device void kernel_holdout_emission_blurring_pathtermination_ao( if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) { PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; - /* Holdout mask objects do not write data passes. */ - kernel_write_data_passes(kg, - buffer, - L, - sd, - sample, - state, - throughput); + +#ifdef __BRANCHED_PATH__ + if(!IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT)) +#endif /* __BRANCHED_PATH__ */ + { + /* Holdout mask objects do not write data passes. */ + kernel_write_data_passes(kg, + buffer, + L, + sd, + sample, + state, + throughput); + } + /* Blurring of bsdf after bounces, for rays that have a small likelihood * of following this particular path (diffuse, rough glossy. 
*/ - if(kernel_data.integrator.filter_glossy != FLT_MAX) { +#ifndef __BRANCHED_PATH__ + if(kernel_data.integrator.filter_glossy != FLT_MAX) +#else + if(kernel_data.integrator.filter_glossy != FLT_MAX && + (!kernel_data.integrator.branched || IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT))) +#endif /* __BRANCHED_PATH__ */ + { float blur_pdf = kernel_data.integrator.filter_glossy*state->min_ray_pdf; if(blur_pdf < 1.0f) { float blur_roughness = sqrtf(1.0f - blur_pdf)*0.5f; @@ -201,85 +223,62 @@ ccl_device void kernel_holdout_emission_blurring_pathtermination_ao( * mainly due to the mixed in MIS that we use. gives too many unneeded * shader evaluations, only need emission if we are going to terminate. */ +#ifndef __BRANCHED_PATH__ float probability = path_state_terminate_probability(kg, state, throughput); +#else + float probability = 1.0f; + + if(!kernel_data.integrator.branched) { + probability = path_state_terminate_probability(kg, state, throughput); + } + else if(IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT)) { + int num_samples = kernel_split_state.branched_state[ray_index].num_samples; + probability = path_state_terminate_probability(kg, state, throughput*num_samples); + } + else if(state->flag & PATH_RAY_TRANSPARENT) { + probability = path_state_terminate_probability(kg, state, throughput); + } +#endif if(probability == 0.0f) { - ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER); - enqueue_flag = 1; + kernel_split_path_end(kg, ray_index); } if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) { if(probability != 1.0f) { float terminate = path_state_rng_1D_for_decision(kg, &rng, state, PRNG_TERMINATE); if(terminate >= probability) { - ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER); - enqueue_flag = 1; + kernel_split_path_end(kg, ray_index); } else { kernel_split_state.throughput[ray_index] = throughput/probability; } } + + kernel_update_denoising_features(kg, sd, state, L); } } #ifdef __AO__ if(IS_STATE(ray_state, ray_index, 
RAY_ACTIVE)) { /* ambient occlusion */ - if(kernel_data.integrator.use_ambient_occlusion || - (sd->flag & SD_AO)) - { - /* todo: solve correlation */ - float bsdf_u, bsdf_v; - path_state_rng_2D(kg, &rng, state, PRNG_BSDF_U, &bsdf_u, &bsdf_v); - - float ao_factor = kernel_data.background.ao_factor; - float3 ao_N; - kernel_split_state.ao_bsdf[ray_index] = shader_bsdf_ao(kg, sd, ao_factor, &ao_N); - kernel_split_state.ao_alpha[ray_index] = shader_bsdf_alpha(kg, sd); - - float3 ao_D; - float ao_pdf; - sample_cos_hemisphere(ao_N, bsdf_u, bsdf_v, &ao_D, &ao_pdf); - - if(dot(sd->Ng, ao_D) > 0.0f && ao_pdf != 0.0f) { - Ray _ray; - _ray.P = ray_offset(sd->P, sd->Ng); - _ray.D = ao_D; - _ray.t = kernel_data.background.ao_distance; -#ifdef __OBJECT_MOTION__ - _ray.time = sd->time; -#endif - _ray.dP = sd->dP; - _ray.dD = differential3_zero(); - kernel_split_state.ao_light_ray[ray_index] = _ray; - - ADD_RAY_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO); - enqueue_flag_AO_SHADOW_RAY_CAST = 1; - } + if(kernel_data.integrator.use_ambient_occlusion || (sd->flag & SD_AO)) { + enqueue_flag = 1; } } #endif /* __AO__ */ - kernel_split_state.rng[ray_index] = rng; + kernel_split_state.rng[ray_index] = rng; #ifndef __COMPUTE_DEVICE_GPU__ } #endif - /* Enqueue RAY_UPDATE_BUFFER rays. */ - enqueue_ray_index_local(ray_index, - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS, - enqueue_flag, - kernel_split_params.queue_size, - &locals->queue_atomics_bg, - kernel_split_state.queue_data, - kernel_split_params.queue_index); - #ifdef __AO__ /* Enqueue to-shadow-ray-cast rays. 
*/ enqueue_ray_index_local(ray_index, QUEUE_SHADOW_RAY_CAST_AO_RAYS, - enqueue_flag_AO_SHADOW_RAY_CAST, + enqueue_flag, kernel_split_params.queue_size, &locals->queue_atomics_ao, kernel_split_state.queue_data, diff --git a/intern/cycles/kernel/split/kernel_indirect_background.h b/intern/cycles/kernel/split/kernel_indirect_background.h index 8192528622e..f0ebb90f60a 100644 --- a/intern/cycles/kernel/split/kernel_indirect_background.h +++ b/intern/cycles/kernel/split/kernel_indirect_background.h @@ -23,7 +23,7 @@ ccl_device void kernel_indirect_background(KernelGlobals *kg) int thread_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); int ray_index; - if(kernel_data.integrator.ao_bounces) { + if(kernel_data.integrator.ao_bounces != INT_MAX) { ray_index = get_ray_index(kg, thread_index, QUEUE_ACTIVE_AND_REGENERATED_RAYS, kernel_split_state.queue_data, @@ -34,7 +34,7 @@ ccl_device void kernel_indirect_background(KernelGlobals *kg) if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) { ccl_global PathState *state = &kernel_split_state.path_state[ray_index]; if(state->bounce > kernel_data.integrator.ao_bounces) { - ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER); + kernel_split_path_end(kg, ray_index); } } } @@ -63,7 +63,7 @@ ccl_device void kernel_indirect_background(KernelGlobals *kg) #ifdef __PASSES__ if(!(kernel_data.film.pass_flag & PASS_BACKGROUND)) #endif - ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER); + kernel_split_path_end(kg, ray_index); } if(IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND)) { @@ -72,7 +72,7 @@ ccl_device void kernel_indirect_background(KernelGlobals *kg) float3 L_background = indirect_background(kg, &kernel_split_state.sd_DL_shadow[ray_index], state, ray); path_radiance_accum_background(L, state, (*throughput), L_background); #endif - ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER); + kernel_split_path_end(kg, ray_index); } } diff --git a/intern/cycles/kernel/split/kernel_indirect_subsurface.h 
b/intern/cycles/kernel/split/kernel_indirect_subsurface.h index a56e85abeb9..82bc2f01fd7 100644 --- a/intern/cycles/kernel/split/kernel_indirect_subsurface.h +++ b/intern/cycles/kernel/split/kernel_indirect_subsurface.h @@ -49,26 +49,29 @@ ccl_device void kernel_indirect_subsurface(KernelGlobals *kg) ccl_global Ray *ray = &kernel_split_state.ray[ray_index]; ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index]; - if(IS_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER)) { - ccl_addr_space SubsurfaceIndirectRays *ss_indirect = &kernel_split_state.ss_rays[ray_index]; - kernel_path_subsurface_accum_indirect(ss_indirect, L); +#ifdef __BRANCHED_PATH__ + if(!kernel_data.integrator.branched) { +#endif + if(IS_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER)) { + ccl_addr_space SubsurfaceIndirectRays *ss_indirect = &kernel_split_state.ss_rays[ray_index]; + kernel_path_subsurface_accum_indirect(ss_indirect, L); - /* Trace indirect subsurface rays by restarting the loop. this uses less - * stack memory than invoking kernel_path_indirect. - */ - if(ss_indirect->num_rays) { - kernel_path_subsurface_setup_indirect(kg, - ss_indirect, - state, - ray, - L, - throughput); - ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED); - } - else { - ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER); + /* Trace indirect subsurface rays by restarting the loop. this uses less + * stack memory than invoking kernel_path_indirect. 
+ */ + if(ss_indirect->num_rays) { + kernel_path_subsurface_setup_indirect(kg, + ss_indirect, + state, + ray, + L, + throughput); + ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED); + } } +#ifdef __BRANCHED_PATH__ } +#endif #endif /* __SUBSURFACE__ */ diff --git a/intern/cycles/kernel/split/kernel_next_iteration_setup.h b/intern/cycles/kernel/split/kernel_next_iteration_setup.h index 1bebc16e25b..7758e35fd32 100644 --- a/intern/cycles/kernel/split/kernel_next_iteration_setup.h +++ b/intern/cycles/kernel/split/kernel_next_iteration_setup.h @@ -44,6 +44,52 @@ CCL_NAMESPACE_BEGIN * - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with * RAY_TO_REGENERATE and more RAY_UPDATE_BUFFER rays. */ + +#ifdef __BRANCHED_PATH__ +ccl_device_inline void kernel_split_branched_indirect_light_init(KernelGlobals *kg, int ray_index) +{ + kernel_split_branched_path_indirect_loop_init(kg, ray_index); + + ADD_RAY_FLAG(kernel_split_state.ray_state, ray_index, RAY_BRANCHED_LIGHT_INDIRECT); +} + +ccl_device void kernel_split_branched_indirect_light_end(KernelGlobals *kg, int ray_index) +{ + kernel_split_branched_path_indirect_loop_end(kg, ray_index); + + ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index]; + ShaderData *sd = &kernel_split_state.sd[ray_index]; + ccl_global PathState *state = &kernel_split_state.path_state[ray_index]; + ccl_global Ray *ray = &kernel_split_state.ray[ray_index]; + + /* continue in case of transparency */ + *throughput *= shader_bsdf_transparency(kg, sd); + + if(is_zero(*throughput)) { + kernel_split_path_end(kg, ray_index); + } + else { + /* Update Path State */ + state->flag |= PATH_RAY_TRANSPARENT; + state->transparent_bounce++; + + ray->P = ray_offset(sd->P, -sd->Ng); + ray->t -= sd->ray_length; /* clipping works through transparent */ + +# ifdef __RAY_DIFFERENTIALS__ + ray->dP = sd->dP; + ray->dD.dx = -sd->dI.dx; + ray->dD.dy = -sd->dI.dy; +# endif /* __RAY_DIFFERENTIALS__ */ + +# ifdef __VOLUME__ + /* enter/exit volume */ + 
kernel_volume_stack_enter_exit(kg, sd, state->volume_stack); +# endif /* __VOLUME__ */ + } +} +#endif /* __BRANCHED_PATH__ */ + ccl_device void kernel_next_iteration_setup(KernelGlobals *kg, ccl_local_param unsigned int *local_queue_atomics) { @@ -67,7 +113,6 @@ ccl_device void kernel_next_iteration_setup(KernelGlobals *kg, kernel_split_params.queue_index[QUEUE_SHADOW_RAY_CAST_DL_RAYS] = 0; } - char enqueue_flag = 0; int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); ray_index = get_ray_index(kg, ray_index, QUEUE_ACTIVE_AND_REGENERATED_RAYS, @@ -75,102 +120,127 @@ ccl_device void kernel_next_iteration_setup(KernelGlobals *kg, kernel_split_params.queue_size, 0); -#ifdef __COMPUTE_DEVICE_GPU__ - /* If we are executing on a GPU device, we exit all threads that are not - * required. - * - * If we are executing on a CPU device, then we need to keep all threads - * active since we have barrier() calls later in the kernel. CPU devices, - * expect all threads to execute barrier statement. - */ - if(ray_index == QUEUE_EMPTY_SLOT) { - return; - } -#endif - -#ifndef __COMPUTE_DEVICE_GPU__ - if(ray_index != QUEUE_EMPTY_SLOT) { -#endif - - /* Load ShaderData structure. */ - PathRadiance *L = NULL; - ccl_global PathState *state = NULL; ccl_global char *ray_state = kernel_split_state.ray_state; - /* Path radiance update for AO/Direct_lighting's shadow blocked. 
*/ - if(IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL) || - IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO)) - { - state = &kernel_split_state.path_state[ray_index]; - L = &kernel_split_state.path_radiance[ray_index]; - float3 _throughput = kernel_split_state.throughput[ray_index]; - - if(IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO)) { - float3 shadow = kernel_split_state.ao_light_ray[ray_index].P; - // TODO(mai): investigate correctness here - char update_path_radiance = (char)kernel_split_state.ao_light_ray[ray_index].t; - if(update_path_radiance) { - path_radiance_accum_ao(L, - _throughput, - kernel_split_state.ao_alpha[ray_index], - kernel_split_state.ao_bsdf[ray_index], - shadow, - state->bounce); - } - else { - path_radiance_accum_total_ao(L, _throughput, kernel_split_state.ao_bsdf[ray_index]); + bool active = IS_STATE(ray_state, ray_index, RAY_ACTIVE); + if(active) { + ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index]; + ccl_global Ray *ray = &kernel_split_state.ray[ray_index]; + RNG rng = kernel_split_state.rng[ray_index]; + ShaderData *sd = &kernel_split_state.sd[ray_index]; + ccl_global PathState *state = &kernel_split_state.path_state[ray_index]; + PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; + +#ifdef __BRANCHED_PATH__ + if(!kernel_data.integrator.branched || IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT)) { +#endif + /* Compute direct lighting and next bounce. 
*/ + if(!kernel_path_surface_bounce(kg, &rng, sd, throughput, state, L, ray)) { + kernel_split_path_end(kg, ray_index); } - REMOVE_RAY_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO); +#ifdef __BRANCHED_PATH__ } - - if(IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL)) { - float3 shadow = kernel_split_state.light_ray[ray_index].P; - // TODO(mai): investigate correctness here - char update_path_radiance = (char)kernel_split_state.light_ray[ray_index].t; - BsdfEval L_light = kernel_split_state.bsdf_eval[ray_index]; - if(update_path_radiance) { - path_radiance_accum_light(L, - _throughput, - &L_light, - shadow, - 1.0f, - state->bounce, - kernel_split_state.is_lamp[ray_index]); + else { + kernel_split_branched_indirect_light_init(kg, ray_index); + + if(kernel_split_branched_path_surface_indirect_light_iter(kg, + ray_index, + 1.0f, + &kernel_split_state.branched_state[ray_index].sd, + true, + true)) + { + ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED); } else { - path_radiance_accum_total_light(L, _throughput, &L_light); + kernel_split_branched_indirect_light_end(kg, ray_index); } - REMOVE_RAY_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL); } +#endif /* __BRANCHED_PATH__ */ + + kernel_split_state.rng[ray_index] = rng; } - if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) { - ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index]; - ccl_global Ray *ray = &kernel_split_state.ray[ray_index]; - RNG rng = kernel_split_state.rng[ray_index]; - state = &kernel_split_state.path_state[ray_index]; - L = &kernel_split_state.path_radiance[ray_index]; + /* Enqueue RAY_UPDATE_BUFFER rays. 
*/ + enqueue_ray_index_local(ray_index, + QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS, + IS_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER) && active, + kernel_split_params.queue_size, + local_queue_atomics, + kernel_split_state.queue_data, + kernel_split_params.queue_index); + +#ifdef __BRANCHED_PATH__ + /* iter loop */ + if(ccl_global_id(0) == 0 && ccl_global_id(1) == 0) { + kernel_split_params.queue_index[QUEUE_LIGHT_INDIRECT_ITER] = 0; + } - /* Compute direct lighting and next bounce. */ - if(!kernel_path_surface_bounce(kg, &rng, &kernel_split_state.sd[ray_index], throughput, state, L, ray)) { - ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER); - enqueue_flag = 1; + ray_index = get_ray_index(kg, ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0), + QUEUE_LIGHT_INDIRECT_ITER, + kernel_split_state.queue_data, + kernel_split_params.queue_size, + 1); + + if(IS_STATE(ray_state, ray_index, RAY_LIGHT_INDIRECT_NEXT_ITER)) { + /* for render passes, sum and reset indirect light pass variables + * for the next samples */ + PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; + + path_radiance_sum_indirect(L); + path_radiance_reset_indirect(L); + + if(kernel_split_branched_path_surface_indirect_light_iter(kg, + ray_index, + 1.0f, + &kernel_split_state.branched_state[ray_index].sd, + true, + true)) + { + ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED); + } + else { + kernel_split_branched_indirect_light_end(kg, ray_index); } - kernel_split_state.rng[ray_index] = rng; } -#ifndef __COMPUTE_DEVICE_GPU__ +# ifdef __VOLUME__ + /* Enqueue RAY_VOLUME_INDIRECT_NEXT_ITER rays */ + ccl_barrier(CCL_LOCAL_MEM_FENCE); + if(ccl_local_id(0) == 0 && ccl_local_id(1) == 0) { + *local_queue_atomics = 0; } -#endif + ccl_barrier(CCL_LOCAL_MEM_FENCE); - /* Enqueue RAY_UPDATE_BUFFER rays. 
*/ + ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); enqueue_ray_index_local(ray_index, - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS, - enqueue_flag, + QUEUE_VOLUME_INDIRECT_ITER, + IS_STATE(kernel_split_state.ray_state, ray_index, RAY_VOLUME_INDIRECT_NEXT_ITER), + kernel_split_params.queue_size, + local_queue_atomics, + kernel_split_state.queue_data, + kernel_split_params.queue_index); + +# endif /* __VOLUME__ */ + +# ifdef __SUBSURFACE__ + /* Enqueue RAY_SUBSURFACE_INDIRECT_NEXT_ITER rays */ + ccl_barrier(CCL_LOCAL_MEM_FENCE); + if(ccl_local_id(0) == 0 && ccl_local_id(1) == 0) { + *local_queue_atomics = 0; + } + ccl_barrier(CCL_LOCAL_MEM_FENCE); + + ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); + enqueue_ray_index_local(ray_index, + QUEUE_SUBSURFACE_INDIRECT_ITER, + IS_STATE(kernel_split_state.ray_state, ray_index, RAY_SUBSURFACE_INDIRECT_NEXT_ITER), kernel_split_params.queue_size, local_queue_atomics, kernel_split_state.queue_data, kernel_split_params.queue_index); +# endif /* __SUBSURFACE__ */ +#endif /* __BRANCHED_PATH__ */ } CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/split/kernel_queue_enqueue.h b/intern/cycles/kernel/split/kernel_queue_enqueue.h index e2e841f36d3..66ce2dfb6f1 100644 --- a/intern/cycles/kernel/split/kernel_queue_enqueue.h +++ b/intern/cycles/kernel/split/kernel_queue_enqueue.h @@ -51,7 +51,8 @@ ccl_device void kernel_queue_enqueue(KernelGlobals *kg, int queue_number = -1; if(IS_STATE(kernel_split_state.ray_state, ray_index, RAY_HIT_BACKGROUND) || - IS_STATE(kernel_split_state.ray_state, ray_index, RAY_UPDATE_BUFFER)) { + IS_STATE(kernel_split_state.ray_state, ray_index, RAY_UPDATE_BUFFER) || + IS_STATE(kernel_split_state.ray_state, ray_index, RAY_TO_REGENERATE)) { queue_number = QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS; } else if(IS_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE) || diff --git a/intern/cycles/kernel/split/kernel_scene_intersect.h 
b/intern/cycles/kernel/split/kernel_scene_intersect.h index 5dc94caec85..45984ca509b 100644 --- a/intern/cycles/kernel/split/kernel_scene_intersect.h +++ b/intern/cycles/kernel/split/kernel_scene_intersect.h @@ -43,11 +43,21 @@ ccl_device void kernel_scene_intersect(KernelGlobals *kg) } /* All regenerated rays become active here */ - if(IS_STATE(kernel_split_state.ray_state, ray_index, RAY_REGENERATED)) - ASSIGN_RAY_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE); + if(IS_STATE(kernel_split_state.ray_state, ray_index, RAY_REGENERATED)) { +#ifdef __BRANCHED_PATH__ + if(kernel_split_state.branched_state[ray_index].waiting_on_shared_samples) { + kernel_split_path_end(kg, ray_index); + } + else +#endif /* __BRANCHED_PATH__ */ + { + ASSIGN_RAY_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE); + } + } - if(!IS_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE)) + if(!IS_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE)) { return; + } #ifdef __KERNEL_DEBUG__ DebugData *debug_data = &kernel_split_state.debug_data[ray_index]; diff --git a/intern/cycles/kernel/split/kernel_shader_eval.h b/intern/cycles/kernel/split/kernel_shader_eval.h index 0f1696e34a0..2801b32f285 100644 --- a/intern/cycles/kernel/split/kernel_shader_eval.h +++ b/intern/cycles/kernel/split/kernel_shader_eval.h @@ -1,5 +1,5 @@ /* - * Copyright 2011-2015 Blender Foundation + * Copyright 2011-2017 Blender Foundation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,54 +16,61 @@ CCL_NAMESPACE_BEGIN -/* This kernel sets up the ShaderData structure from the values computed +/* This kernel evaluates ShaderData structure from the values computed * by the previous kernels. - * - * It also identifies the rays of state RAY_TO_REGENERATE and enqueues them - * in QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue. 
*/ -ccl_device void kernel_shader_eval(KernelGlobals *kg, - ccl_local_param unsigned int *local_queue_atomics) +ccl_device void kernel_shader_eval(KernelGlobals *kg) { - /* Enqeueue RAY_TO_REGENERATE rays into QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue. */ - if(ccl_local_id(0) == 0 && ccl_local_id(1) == 0) { - *local_queue_atomics = 0; - } - ccl_barrier(CCL_LOCAL_MEM_FENCE); int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); + /* Sorting on cuda split is not implemented */ +#ifdef __KERNEL_CUDA__ + int queue_index = kernel_split_params.queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS]; +#else + int queue_index = kernel_split_params.queue_index[QUEUE_SHADER_SORTED_RAYS]; +#endif + if(ray_index >= queue_index) { + return; + } ray_index = get_ray_index(kg, ray_index, +#ifdef __KERNEL_CUDA__ QUEUE_ACTIVE_AND_REGENERATED_RAYS, +#else + QUEUE_SHADER_SORTED_RAYS, +#endif kernel_split_state.queue_data, kernel_split_params.queue_size, 0); - char enqueue_flag = 0; - if((ray_index != QUEUE_EMPTY_SLOT) && IS_STATE(kernel_split_state.ray_state, ray_index, RAY_TO_REGENERATE)) { - enqueue_flag = 1; + if(ray_index == QUEUE_EMPTY_SLOT) { + return; } - enqueue_ray_index_local(ray_index, - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS, - enqueue_flag, - kernel_split_params.queue_size, - local_queue_atomics, - kernel_split_state.queue_data, - kernel_split_params.queue_index); - - /* Continue on with shader evaluation. 
*/ - if((ray_index != QUEUE_EMPTY_SLOT) && IS_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE)) { - Intersection isect = kernel_split_state.isect[ray_index]; + ccl_global char *ray_state = kernel_split_state.ray_state; + if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) { RNG rng = kernel_split_state.rng[ray_index]; ccl_global PathState *state = &kernel_split_state.path_state[ray_index]; - Ray ray = kernel_split_state.ray[ray_index]; - shader_setup_from_ray(kg, - &kernel_split_state.sd[ray_index], - &isect, - &ray); +#ifndef __BRANCHED_PATH__ float rbsdf = path_state_rng_1D_for_decision(kg, &rng, state, PRNG_BSDF); shader_eval_surface(kg, &kernel_split_state.sd[ray_index], &rng, state, rbsdf, state->flag, SHADER_CONTEXT_MAIN); +#else + ShaderContext ctx = SHADER_CONTEXT_MAIN; + float rbsdf = 0.0f; + + if(!kernel_data.integrator.branched || IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT)) { + rbsdf = path_state_rng_1D_for_decision(kg, &rng, state, PRNG_BSDF); + + } + + if(IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT)) { + ctx = SHADER_CONTEXT_INDIRECT; + } + + shader_eval_surface(kg, &kernel_split_state.sd[ray_index], &rng, state, rbsdf, state->flag, ctx); + shader_merge_closures(&kernel_split_state.sd[ray_index]); +#endif /* __BRANCHED_PATH__ */ + kernel_split_state.rng[ray_index] = rng; } } diff --git a/intern/cycles/kernel/split/kernel_shader_setup.h b/intern/cycles/kernel/split/kernel_shader_setup.h new file mode 100644 index 00000000000..0432689d9fa --- /dev/null +++ b/intern/cycles/kernel/split/kernel_shader_setup.h @@ -0,0 +1,70 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +CCL_NAMESPACE_BEGIN + +/* This kernel sets up the ShaderData structure from the values computed + * by the previous kernels. + * + * It also identifies the rays of state RAY_TO_REGENERATE and enqueues them + * in QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue. + */ +ccl_device void kernel_shader_setup(KernelGlobals *kg, + ccl_local_param unsigned int *local_queue_atomics) +{ + /* Enqueue RAY_TO_REGENERATE rays into QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue. */ + if(ccl_local_id(0) == 0 && ccl_local_id(1) == 0) { + *local_queue_atomics = 0; + } + ccl_barrier(CCL_LOCAL_MEM_FENCE); + + int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); + int queue_index = kernel_split_params.queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS]; + if(ray_index >= queue_index) { + return; + } + ray_index = get_ray_index(kg, ray_index, + QUEUE_ACTIVE_AND_REGENERATED_RAYS, + kernel_split_state.queue_data, + kernel_split_params.queue_size, + 0); + + if(ray_index == QUEUE_EMPTY_SLOT) { + return; + } + + char enqueue_flag = (IS_STATE(kernel_split_state.ray_state, ray_index, RAY_TO_REGENERATE)) ? 1 : 0; + enqueue_ray_index_local(ray_index, + QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS, + enqueue_flag, + kernel_split_params.queue_size, + local_queue_atomics, + kernel_split_state.queue_data, + kernel_split_params.queue_index); + + /* Continue on with shader evaluation. 
*/ + if(IS_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE)) { + Intersection isect = kernel_split_state.isect[ray_index]; + Ray ray = kernel_split_state.ray[ray_index]; + + shader_setup_from_ray(kg, + &kernel_split_state.sd[ray_index], + &isect, + &ray); + } +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/split/kernel_shader_sort.h b/intern/cycles/kernel/split/kernel_shader_sort.h new file mode 100644 index 00000000000..297decb0bc2 --- /dev/null +++ b/intern/cycles/kernel/split/kernel_shader_sort.h @@ -0,0 +1,97 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +CCL_NAMESPACE_BEGIN + + +ccl_device void kernel_shader_sort(KernelGlobals *kg, + ccl_local_param ShaderSortLocals *locals) +{ +#ifndef __KERNEL_CUDA__ + int tid = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); + uint qsize = kernel_split_params.queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS]; + if(tid == 0) { + kernel_split_params.queue_index[QUEUE_SHADER_SORTED_RAYS] = qsize; + } + + uint offset = (tid/SHADER_SORT_LOCAL_SIZE)*SHADER_SORT_BLOCK_SIZE; + if(offset >= qsize) { + return; + } + + int lid = ccl_local_id(1) * ccl_local_size(0) + ccl_local_id(0); + uint input = QUEUE_ACTIVE_AND_REGENERATED_RAYS * (kernel_split_params.queue_size); + uint output = QUEUE_SHADER_SORTED_RAYS * (kernel_split_params.queue_size); + ccl_local uint *local_value = &locals->local_value[0]; + ccl_local ushort *local_index = &locals->local_index[0]; + + /* copy to local memory */ + for (uint i = 0; i < SHADER_SORT_BLOCK_SIZE; i += SHADER_SORT_LOCAL_SIZE) { + uint idx = offset + i + lid; + uint add = input + idx; + uint value = (~0); + if(idx < qsize) { + int ray_index = kernel_split_state.queue_data[add]; + bool valid = (ray_index != QUEUE_EMPTY_SLOT) && IS_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE); + if(valid) { + value = kernel_split_state.sd[ray_index].shader & SHADER_MASK; + } + } + local_value[i + lid] = value; + local_index[i + lid] = i + lid; + } + ccl_barrier(CCL_LOCAL_MEM_FENCE); + + /* skip sorting for cpu split kernel */ +# ifdef __KERNEL_OPENCL__ + + /* bitonic sort */ + for (uint length = 1; length < SHADER_SORT_BLOCK_SIZE; length <<= 1) { + for (uint inc = length; inc > 0; inc >>= 1) { + for (uint ii = 0; ii < SHADER_SORT_BLOCK_SIZE; ii += SHADER_SORT_LOCAL_SIZE) { + uint i = lid + ii; + bool direction = ((i & (length << 1)) != 0); + uint j = i ^ inc; + ushort ioff = local_index[i]; + ushort joff = local_index[j]; + uint iKey = local_value[ioff]; + uint jKey = local_value[joff]; + bool smaller = (jKey < iKey) || (jKey == iKey && j < i); + 
bool swap = smaller ^ (j < i) ^ direction; + ccl_barrier(CCL_LOCAL_MEM_FENCE); + local_index[i] = (swap) ? joff : ioff; + local_index[j] = (swap) ? ioff : joff; + ccl_barrier(CCL_LOCAL_MEM_FENCE); + } + } + } +# endif /* __KERNEL_OPENCL__ */ + + /* copy to destination */ + for (uint i = 0; i < SHADER_SORT_BLOCK_SIZE; i += SHADER_SORT_LOCAL_SIZE) { + uint idx = offset + i + lid; + uint lidx = local_index[i + lid]; + uint outi = output + idx; + uint ini = input + offset + lidx; + uint value = local_value[lidx]; + if(idx < qsize) { + kernel_split_state.queue_data[outi] = (value == (~0)) ? QUEUE_EMPTY_SLOT : kernel_split_state.queue_data[ini]; + } + } +#endif /* __KERNEL_CUDA__ */ +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/split/kernel_shadow_blocked_ao.h b/intern/cycles/kernel/split/kernel_shadow_blocked_ao.h index 4243e18de72..474286285a9 100644 --- a/intern/cycles/kernel/split/kernel_shadow_blocked_ao.h +++ b/intern/cycles/kernel/split/kernel_shadow_blocked_ao.h @@ -29,31 +29,29 @@ ccl_device void kernel_shadow_blocked_ao(KernelGlobals *kg) kernel_split_state.queue_data, kernel_split_params.queue_size, 1); } - if(ray_index == QUEUE_EMPTY_SLOT) + if(ray_index == QUEUE_EMPTY_SLOT) { return; + } - /* Flag determining if we need to update L. */ - char update_path_radiance = 0; - - if(IS_FLAG(kernel_split_state.ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO)) { - ccl_global PathState *state = &kernel_split_state.path_state[ray_index]; - ccl_global Ray *light_ray_global = &kernel_split_state.ao_light_ray[ray_index]; - - float3 shadow; - Ray ray = *light_ray_global; - update_path_radiance = !(shadow_blocked(kg, - &kernel_split_state.sd_DL_shadow[ray_index], - state, - &ray, - &shadow)); - - *light_ray_global = ray; - /* We use light_ray_global's P and t to store shadow and - * update_path_radiance. 
- */ - light_ray_global->P = shadow; - light_ray_global->t = update_path_radiance; + ShaderData *sd = &kernel_split_state.sd[ray_index]; + ShaderData *emission_sd = &kernel_split_state.sd_DL_shadow[ray_index]; + PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; + ccl_global PathState *state = &kernel_split_state.path_state[ray_index]; + RNG rng = kernel_split_state.rng[ray_index]; + float3 throughput = kernel_split_state.throughput[ray_index]; + +#ifdef __BRANCHED_PATH__ + if(!kernel_data.integrator.branched || IS_FLAG(kernel_split_state.ray_state, ray_index, RAY_BRANCHED_INDIRECT)) { +#endif + kernel_path_ao(kg, sd, emission_sd, L, state, &rng, throughput, shader_bsdf_alpha(kg, sd)); +#ifdef __BRANCHED_PATH__ + } + else { + kernel_branched_path_ao(kg, sd, emission_sd, L, state, &rng, throughput); } +#endif + + kernel_split_state.rng[ray_index] = rng; } CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/split/kernel_shadow_blocked_dl.h b/intern/cycles/kernel/split/kernel_shadow_blocked_dl.h index bb8f0157965..78e61709b01 100644 --- a/intern/cycles/kernel/split/kernel_shadow_blocked_dl.h +++ b/intern/cycles/kernel/split/kernel_shadow_blocked_dl.h @@ -29,31 +29,82 @@ ccl_device void kernel_shadow_blocked_dl(KernelGlobals *kg) kernel_split_state.queue_data, kernel_split_params.queue_size, 1); } +#ifdef __BRANCHED_PATH__ + /* TODO(mai): move this somewhere else? */ + if(thread_index == 0) { + /* Clear QUEUE_INACTIVE_RAYS before next kernel. */ + kernel_split_params.queue_index[QUEUE_INACTIVE_RAYS] = 0; + } +#endif /* __BRANCHED_PATH__ */ + if(ray_index == QUEUE_EMPTY_SLOT) return; - /* Flag determining if we need to update L. 
*/ - char update_path_radiance = 0; + ccl_global PathState *state = &kernel_split_state.path_state[ray_index]; + Ray ray = kernel_split_state.light_ray[ray_index]; + PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; + ShaderData *sd = &kernel_split_state.sd[ray_index]; + float3 throughput = kernel_split_state.throughput[ray_index]; + RNG rng = kernel_split_state.rng[ray_index]; + + BsdfEval L_light = kernel_split_state.bsdf_eval[ray_index]; + ShaderData *emission_sd = &kernel_split_state.sd_DL_shadow[ray_index]; + bool is_lamp = kernel_split_state.is_lamp[ray_index]; + +# if defined(__BRANCHED_PATH__) || defined(__SHADOW_TRICKS__) + bool use_branched = false; + int all = 0; + + if(state->flag & PATH_RAY_SHADOW_CATCHER) { + use_branched = true; + all = 1; + } +# if defined(__BRANCHED_PATH__) + else if(kernel_data.integrator.branched) { + use_branched = true; - if(IS_FLAG(kernel_split_state.ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL)) { - ccl_global PathState *state = &kernel_split_state.path_state[ray_index]; - ccl_global Ray *light_ray_global = &kernel_split_state.light_ray[ray_index]; + if(IS_FLAG(kernel_split_state.ray_state, ray_index, RAY_BRANCHED_INDIRECT)) { + all = (kernel_data.integrator.sample_all_lights_indirect); + } + else + { + all = (kernel_data.integrator.sample_all_lights_direct); + } + } +# endif /* __BRANCHED_PATH__ */ + if(use_branched) { + kernel_branched_path_surface_connect_light(kg, + &rng, + sd, + emission_sd, + state, + throughput, + 1.0f, + L, + all); + } + else +# endif /* defined(__BRANCHED_PATH__) || defined(__SHADOW_TRICKS__)*/ + { + /* trace shadow ray */ float3 shadow; - Ray ray = *light_ray_global; - update_path_radiance = !(shadow_blocked(kg, - &kernel_split_state.sd_DL_shadow[ray_index], - state, - &ray, - &shadow)); - - *light_ray_global = ray; - /* We use light_ray_global's P and t to store shadow and - * update_path_radiance. 
- */ - light_ray_global->P = shadow; - light_ray_global->t = update_path_radiance; + + if(!shadow_blocked(kg, + emission_sd, + state, + &ray, + &shadow)) + { + /* accumulate */ + path_radiance_accum_light(L, state, throughput, &L_light, shadow, 1.0f, is_lamp); + } + else { + path_radiance_accum_total_light(L, state, throughput, &L_light); + } } + + kernel_split_state.rng[ray_index] = rng; } CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/split/kernel_split_common.h b/intern/cycles/kernel/split/kernel_split_common.h index 4303ba0a905..08f0124b529 100644 --- a/intern/cycles/kernel/split/kernel_split_common.h +++ b/intern/cycles/kernel/split/kernel_split_common.h @@ -37,41 +37,55 @@ #include "util/util_atomic.h" -#include "kernel/kernel_random.h" -#include "kernel/kernel_projection.h" -#include "kernel/kernel_montecarlo.h" -#include "kernel/kernel_differential.h" -#include "kernel/kernel_camera.h" - -#include "kernel/geom/geom.h" -#include "kernel/bvh/bvh.h" - -#include "kernel/kernel_accumulate.h" -#include "kernel/kernel_shader.h" -#include "kernel/kernel_light.h" -#include "kernel/kernel_passes.h" - -#ifdef __SUBSURFACE__ -# include "kernel/kernel_subsurface.h" +#include "kernel/kernel_path.h" +#ifdef __BRANCHED_PATH__ +# include "kernel/kernel_path_branched.h" #endif -#ifdef __VOLUME__ -# include "kernel/kernel_volume.h" -#endif +#include "kernel/kernel_queues.h" +#include "kernel/kernel_work_stealing.h" -#include "kernel/kernel_path_state.h" -#include "kernel/kernel_shadow.h" -#include "kernel/kernel_emission.h" -#include "kernel/kernel_path_common.h" -#include "kernel/kernel_path_surface.h" -#include "kernel/kernel_path_volume.h" -#include "kernel/kernel_path_subsurface.h" +#ifdef __BRANCHED_PATH__ +# include "kernel/split/kernel_branched.h" +#endif -#ifdef __KERNEL_DEBUG__ -# include "kernel/kernel_debug.h" +CCL_NAMESPACE_BEGIN + +ccl_device_inline void kernel_split_path_end(KernelGlobals *kg, int ray_index) +{ + ccl_global char *ray_state = 
kernel_split_state.ray_state; + +#ifdef __BRANCHED_PATH__ + if(IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT_SHARED)) { + int orig_ray = kernel_split_state.branched_state[ray_index].original_ray; + + PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; + PathRadiance *orig_ray_L = &kernel_split_state.path_radiance[orig_ray]; + + path_radiance_sum_indirect(L); + path_radiance_accum_sample(orig_ray_L, L, 1); + + atomic_fetch_and_dec_uint32((ccl_global uint*)&kernel_split_state.branched_state[orig_ray].shared_sample_count); + + ASSIGN_RAY_STATE(ray_state, ray_index, RAY_INACTIVE); + } + else if(IS_FLAG(ray_state, ray_index, RAY_BRANCHED_LIGHT_INDIRECT)) { + ASSIGN_RAY_STATE(ray_state, ray_index, RAY_LIGHT_INDIRECT_NEXT_ITER); + } + else if(IS_FLAG(ray_state, ray_index, RAY_BRANCHED_VOLUME_INDIRECT)) { + ASSIGN_RAY_STATE(ray_state, ray_index, RAY_VOLUME_INDIRECT_NEXT_ITER); + } + else if(IS_FLAG(ray_state, ray_index, RAY_BRANCHED_SUBSURFACE_INDIRECT)) { + ASSIGN_RAY_STATE(ray_state, ray_index, RAY_SUBSURFACE_INDIRECT_NEXT_ITER); + } + else { + ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER); + } +#else + ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER); #endif +} -#include "kernel/kernel_queues.h" -#include "kernel/kernel_work_stealing.h" +CCL_NAMESPACE_END #endif /* __KERNEL_SPLIT_H__ */ diff --git a/intern/cycles/kernel/split/kernel_split_data.h b/intern/cycles/kernel/split/kernel_split_data.h index 17e6587883a..eac22050a38 100644 --- a/intern/cycles/kernel/split/kernel_split_data.h +++ b/intern/cycles/kernel/split/kernel_split_data.h @@ -31,14 +31,6 @@ ccl_device_inline uint64_t split_data_buffer_size(KernelGlobals *kg, size_t num_ size = size SPLIT_DATA_ENTRIES; #undef SPLIT_DATA_ENTRY -#ifdef __SUBSURFACE__ - size += align_up(num_elements * sizeof(SubsurfaceIndirectRays), 16); /* ss_rays */ -#endif - -#ifdef __VOLUME__ - size += align_up(2 * num_elements * sizeof(PathState), 16); /* state_shadow */ -#endif - return size; } @@ 
-57,16 +49,6 @@ ccl_device_inline void split_data_init(KernelGlobals *kg, SPLIT_DATA_ENTRIES; #undef SPLIT_DATA_ENTRY -#ifdef __SUBSURFACE__ - split_data->ss_rays = (ccl_global SubsurfaceIndirectRays*)p; - p += align_up(num_elements * sizeof(SubsurfaceIndirectRays), 16); -#endif - -#ifdef __VOLUME__ - split_data->state_shadow = (ccl_global PathState*)p; - p += align_up(2 * num_elements * sizeof(PathState), 16); -#endif - split_data->ray_state = ray_state; } diff --git a/intern/cycles/kernel/split/kernel_split_data_types.h b/intern/cycles/kernel/split/kernel_split_data_types.h index 748197b7183..4bb2f0d3d80 100644 --- a/intern/cycles/kernel/split/kernel_split_data_types.h +++ b/intern/cycles/kernel/split/kernel_split_data_types.h @@ -43,6 +43,9 @@ typedef struct SplitParams { ccl_global char *use_queues_flag; ccl_global float *buffer; + + /* Place for storing sd->flag. AMD GPU OpenCL compiler workaround */ + int dummy_sd_flag; } SplitParams; /* Global memory variables [porting]; These memory is used for @@ -59,7 +62,64 @@ typedef struct SplitParams { SPLIT_DATA_ENTRY(DebugData, debug_data, 1) #else # define SPLIT_DATA_DEBUG_ENTRIES -#endif +#endif /* DEBUG */ + +#ifdef __BRANCHED_PATH__ + +typedef ccl_global struct SplitBranchedState { + /* various state that must be kept and restored after an indirect loop */ + PathState path_state; + float3 throughput; + Ray ray; + + struct ShaderData sd; + Intersection isect; + + char ray_state; + + /* indirect loop state */ + int next_closure; + int next_sample; + int num_samples; + +#ifdef __SUBSURFACE__ + int ss_next_closure; + int ss_next_sample; + int next_hit; + int num_hits; + + uint lcg_state; + SubsurfaceIntersection ss_isect; + +# ifdef __VOLUME__ + VolumeStack volume_stack[VOLUME_STACK_SIZE]; +# endif /* __VOLUME__ */ +#endif /*__SUBSURFACE__ */ + + int shared_sample_count; /* number of branched samples shared with other threads */ + int original_ray; /* index of original ray when sharing branched samples */ + bool 
waiting_on_shared_samples; +} SplitBranchedState; + +#define SPLIT_DATA_BRANCHED_ENTRIES \ + SPLIT_DATA_ENTRY( SplitBranchedState, branched_state, 1) +#else +#define SPLIT_DATA_BRANCHED_ENTRIES +#endif /* __BRANCHED_PATH__ */ + +#ifdef __SUBSURFACE__ +# define SPLIT_DATA_SUBSURFACE_ENTRIES \ + SPLIT_DATA_ENTRY(ccl_global SubsurfaceIndirectRays, ss_rays, 1) +#else +# define SPLIT_DATA_SUBSURFACE_ENTRIES +#endif /* __SUBSURFACE__ */ + +#ifdef __VOLUME__ +# define SPLIT_DATA_VOLUME_ENTRIES \ + SPLIT_DATA_ENTRY(ccl_global PathState, state_shadow, 1) +#else +# define SPLIT_DATA_VOLUME_ENTRIES +#endif /* __VOLUME__ */ #define SPLIT_DATA_ENTRIES \ SPLIT_DATA_ENTRY(ccl_global RNG, rng, 1) \ @@ -69,9 +129,6 @@ typedef struct SplitParams { SPLIT_DATA_ENTRY(ccl_global Ray, ray, 1) \ SPLIT_DATA_ENTRY(ccl_global PathState, path_state, 1) \ SPLIT_DATA_ENTRY(ccl_global Intersection, isect, 1) \ - SPLIT_DATA_ENTRY(ccl_global float3, ao_alpha, 1) \ - SPLIT_DATA_ENTRY(ccl_global float3, ao_bsdf, 1) \ - SPLIT_DATA_ENTRY(ccl_global Ray, ao_light_ray, 1) \ SPLIT_DATA_ENTRY(ccl_global BsdfEval, bsdf_eval, 1) \ SPLIT_DATA_ENTRY(ccl_global int, is_lamp, 1) \ SPLIT_DATA_ENTRY(ccl_global Ray, light_ray, 1) \ @@ -79,6 +136,28 @@ typedef struct SplitParams { SPLIT_DATA_ENTRY(ccl_global uint, work_array, 1) \ SPLIT_DATA_ENTRY(ShaderData, sd, 1) \ SPLIT_DATA_ENTRY(ShaderData, sd_DL_shadow, 1) \ + SPLIT_DATA_SUBSURFACE_ENTRIES \ + SPLIT_DATA_VOLUME_ENTRIES \ + SPLIT_DATA_BRANCHED_ENTRIES \ + SPLIT_DATA_DEBUG_ENTRIES \ + +/* entries to be copied to inactive rays when sharing branched samples (TODO: which are actually needed?) 
*/ +#define SPLIT_DATA_ENTRIES_BRANCHED_SHARED \ + SPLIT_DATA_ENTRY(ccl_global RNG, rng, 1) \ + SPLIT_DATA_ENTRY(ccl_global float3, throughput, 1) \ + SPLIT_DATA_ENTRY(ccl_global float, L_transparent, 1) \ + SPLIT_DATA_ENTRY(PathRadiance, path_radiance, 1) \ + SPLIT_DATA_ENTRY(ccl_global Ray, ray, 1) \ + SPLIT_DATA_ENTRY(ccl_global PathState, path_state, 1) \ + SPLIT_DATA_ENTRY(ccl_global Intersection, isect, 1) \ + SPLIT_DATA_ENTRY(ccl_global BsdfEval, bsdf_eval, 1) \ + SPLIT_DATA_ENTRY(ccl_global int, is_lamp, 1) \ + SPLIT_DATA_ENTRY(ccl_global Ray, light_ray, 1) \ + SPLIT_DATA_ENTRY(ShaderData, sd, 1) \ + SPLIT_DATA_ENTRY(ShaderData, sd_DL_shadow, 1) \ + SPLIT_DATA_SUBSURFACE_ENTRIES \ + SPLIT_DATA_VOLUME_ENTRIES \ + SPLIT_DATA_BRANCHED_ENTRIES \ SPLIT_DATA_DEBUG_ENTRIES \ /* struct that holds pointers to data in the shared state buffer */ @@ -87,14 +166,6 @@ typedef struct SplitData { SPLIT_DATA_ENTRIES #undef SPLIT_DATA_ENTRY -#ifdef __SUBSURFACE__ - ccl_global SubsurfaceIndirectRays *ss_rays; -#endif - -#ifdef __VOLUME__ - ccl_global PathState *state_shadow; -#endif - /* this is actually in a separate buffer from the rest of the split state data (so it can be read back from * the host easily) but is still used the same as the other data so we have it here in this struct as well */ @@ -122,6 +193,11 @@ typedef struct BackgroundAOLocals { uint queue_atomics_ao; } BackgroundAOLocals; +typedef struct ShaderSortLocals { + uint local_value[SHADER_SORT_BLOCK_SIZE]; + ushort local_index[SHADER_SORT_BLOCK_SIZE]; +} ShaderSortLocals; + CCL_NAMESPACE_END #endif /* __KERNEL_SPLIT_DATA_TYPES_H__ */ diff --git a/intern/cycles/kernel/split/kernel_subsurface_scatter.h b/intern/cycles/kernel/split/kernel_subsurface_scatter.h index 0b4d50c70ee..d5083b23f80 100644 --- a/intern/cycles/kernel/split/kernel_subsurface_scatter.h +++ b/intern/cycles/kernel/split/kernel_subsurface_scatter.h @@ -16,82 +16,306 @@ CCL_NAMESPACE_BEGIN +#if defined(__BRANCHED_PATH__) && 
defined(__SUBSURFACE__) -ccl_device void kernel_subsurface_scatter(KernelGlobals *kg, - ccl_local_param unsigned int* local_queue_atomics) +ccl_device_inline void kernel_split_branched_path_subsurface_indirect_light_init(KernelGlobals *kg, int ray_index) { -#ifdef __SUBSURFACE__ - if(ccl_local_id(0) == 0 && ccl_local_id(1) == 0) { - *local_queue_atomics = 0; + kernel_split_branched_path_indirect_loop_init(kg, ray_index); + + SplitBranchedState *branched_state = &kernel_split_state.branched_state[ray_index]; + + branched_state->ss_next_closure = 0; + branched_state->ss_next_sample = 0; + + branched_state->num_hits = 0; + branched_state->next_hit = 0; + + ADD_RAY_FLAG(kernel_split_state.ray_state, ray_index, RAY_BRANCHED_SUBSURFACE_INDIRECT); +} + +ccl_device_noinline bool kernel_split_branched_path_subsurface_indirect_light_iter(KernelGlobals *kg, int ray_index) +{ + SplitBranchedState *branched_state = &kernel_split_state.branched_state[ray_index]; + + ShaderData *sd = &branched_state->sd; + RNG rng = kernel_split_state.rng[ray_index]; + PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; + ShaderData *emission_sd = &kernel_split_state.sd_DL_shadow[ray_index]; + + for(int i = branched_state->ss_next_closure; i < sd->num_closure; i++) { + ShaderClosure *sc = &sd->closure[i]; + + if(!CLOSURE_IS_BSSRDF(sc->type)) + continue; + + /* set up random number generator */ + if(branched_state->ss_next_sample == 0 && branched_state->next_hit == 0 && + branched_state->next_closure == 0 && branched_state->next_sample == 0) + { + branched_state->lcg_state = lcg_state_init(&rng, + branched_state->path_state.rng_offset, + branched_state->path_state.sample, + 0x68bc21eb); + } + int num_samples = kernel_data.integrator.subsurface_samples; + float num_samples_inv = 1.0f/num_samples; + RNG bssrdf_rng = cmj_hash(rng, i); + + /* do subsurface scatter step with copy of shader data, this will + * replace the BSSRDF with a diffuse BSDF closure */ + for(int j = 
branched_state->ss_next_sample; j < num_samples; j++) { + ccl_global SubsurfaceIntersection *ss_isect = &branched_state->ss_isect; + float bssrdf_u, bssrdf_v; + path_branched_rng_2D(kg, + &bssrdf_rng, + &branched_state->path_state, + j, + num_samples, + PRNG_BSDF_U, + &bssrdf_u, + &bssrdf_v); + + /* intersection is expensive so avoid doing multiple times for the same input */ + if(branched_state->next_hit == 0 && branched_state->next_closure == 0 && branched_state->next_sample == 0) { + RNG lcg_state = branched_state->lcg_state; + SubsurfaceIntersection ss_isect_private; + + branched_state->num_hits = subsurface_scatter_multi_intersect(kg, + &ss_isect_private, + sd, + sc, + &lcg_state, + bssrdf_u, bssrdf_v, + true); + + branched_state->lcg_state = lcg_state; + *ss_isect = ss_isect_private; + } + +#ifdef __VOLUME__ + Ray volume_ray = branched_state->ray; + bool need_update_volume_stack = + kernel_data.integrator.use_volumes && + sd->object_flag & SD_OBJECT_INTERSECTS_VOLUME; +#endif /* __VOLUME__ */ + + /* compute lighting with the BSDF closure */ + for(int hit = branched_state->next_hit; hit < branched_state->num_hits; hit++) { + ShaderData *bssrdf_sd = &kernel_split_state.sd[ray_index]; + *bssrdf_sd = *sd; /* note: copy happens each iteration of inner loop, this is + * important as the indirect path will write into bssrdf_sd */ + + SubsurfaceIntersection ss_isect_private = *ss_isect; + subsurface_scatter_multi_setup(kg, + &ss_isect_private, + hit, + bssrdf_sd, + &branched_state->path_state, + branched_state->path_state.flag, + sc, + true); + *ss_isect = ss_isect_private; + + ccl_global PathState *hit_state = &kernel_split_state.path_state[ray_index]; + *hit_state = branched_state->path_state; + + path_state_branch(hit_state, j, num_samples); + +#ifdef __VOLUME__ + if(need_update_volume_stack) { + /* Setup ray from previous surface point to the new one. 
*/ + float3 P = ray_offset(bssrdf_sd->P, -bssrdf_sd->Ng); + volume_ray.D = normalize_len(P - volume_ray.P, &volume_ray.t); + + /* this next part is expensive as it does scene intersection so only do once */ + if(branched_state->next_closure == 0 && branched_state->next_sample == 0) { + for(int k = 0; k < VOLUME_STACK_SIZE; k++) { + branched_state->volume_stack[k] = hit_state->volume_stack[k]; + } + + kernel_volume_stack_update_for_subsurface(kg, + emission_sd, + &volume_ray, + branched_state->volume_stack); + } + + for(int k = 0; k < VOLUME_STACK_SIZE; k++) { + hit_state->volume_stack[k] = branched_state->volume_stack[k]; + } + } +#endif /* __VOLUME__ */ + +#ifdef __EMISSION__ + if(branched_state->next_closure == 0 && branched_state->next_sample == 0) { + /* direct light */ + if(kernel_data.integrator.use_direct_light) { + int all = (kernel_data.integrator.sample_all_lights_direct) || + (branched_state->path_state.flag & PATH_RAY_SHADOW_CATCHER); + kernel_branched_path_surface_connect_light(kg, + &rng, + bssrdf_sd, + emission_sd, + hit_state, + branched_state->throughput, + num_samples_inv, + L, + all); + } + } +#endif /* __EMISSION__ */ + + /* indirect light */ + if(kernel_split_branched_path_surface_indirect_light_iter(kg, + ray_index, + num_samples_inv, + bssrdf_sd, + false, + false)) + { + branched_state->ss_next_closure = i; + branched_state->ss_next_sample = j; + branched_state->next_hit = hit; + + return true; + } + + branched_state->next_closure = 0; + } + + branched_state->next_hit = 0; + } + + branched_state->ss_next_sample = 0; + } + + branched_state->ss_next_closure = sd->num_closure; + + branched_state->waiting_on_shared_samples = (branched_state->shared_sample_count > 0); + if(branched_state->waiting_on_shared_samples) { + return true; + } + + kernel_split_branched_path_indirect_loop_end(kg, ray_index); + + return false; +} + +#endif /* __BRANCHED_PATH__ && __SUBSURFACE__ */ + +ccl_device void kernel_subsurface_scatter(KernelGlobals *kg) +{ + int 
thread_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); + if(thread_index == 0) { + /* We will empty both queues in this kernel. */ + kernel_split_params.queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS] = 0; + kernel_split_params.queue_index[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS] = 0; } - ccl_barrier(CCL_LOCAL_MEM_FENCE); int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); ray_index = get_ray_index(kg, ray_index, QUEUE_ACTIVE_AND_REGENERATED_RAYS, kernel_split_state.queue_data, kernel_split_params.queue_size, - 0); - -#ifdef __COMPUTE_DEVICE_GPU__ - /* If we are executing on a GPU device, we exit all threads that are not - * required. - * - * If we are executing on a CPU device, then we need to keep all threads - * active since we have barrier() calls later in the kernel. CPU devices, - * expect all threads to execute barrier statement. - */ - if(ray_index == QUEUE_EMPTY_SLOT) { - return; - } -#endif - - char enqueue_flag = 0; - -#ifndef __COMPUTE_DEVICE_GPU__ - if(ray_index != QUEUE_EMPTY_SLOT) { -#endif + 1); + get_ray_index(kg, thread_index, + QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS, + kernel_split_state.queue_data, + kernel_split_params.queue_size, + 1); +#ifdef __SUBSURFACE__ ccl_global char *ray_state = kernel_split_state.ray_state; - ccl_global PathState *state = &kernel_split_state.path_state[ray_index]; - PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; - RNG rng = kernel_split_state.rng[ray_index]; - ccl_global Ray *ray = &kernel_split_state.ray[ray_index]; - ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index]; - ccl_global SubsurfaceIndirectRays *ss_indirect = &kernel_split_state.ss_rays[ray_index]; - ShaderData *sd = &kernel_split_state.sd[ray_index]; - ShaderData *emission_sd = &kernel_split_state.sd_DL_shadow[ray_index]; if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) { + ccl_global PathState *state = &kernel_split_state.path_state[ray_index]; + PathRadiance *L = 
&kernel_split_state.path_radiance[ray_index]; + RNG rng = kernel_split_state.rng[ray_index]; + ccl_global Ray *ray = &kernel_split_state.ray[ray_index]; + ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index]; + ccl_global SubsurfaceIndirectRays *ss_indirect = &kernel_split_state.ss_rays[ray_index]; + ShaderData *sd = &kernel_split_state.sd[ray_index]; + ShaderData *emission_sd = &kernel_split_state.sd_DL_shadow[ray_index]; + if(sd->flag & SD_BSSRDF) { - if(kernel_path_subsurface_scatter(kg, - sd, - emission_sd, - L, - state, - &rng, - ray, - throughput, - ss_indirect)) { - ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER); - enqueue_flag = 1; + +#ifdef __BRANCHED_PATH__ + if(!kernel_data.integrator.branched) { +#endif + if(kernel_path_subsurface_scatter(kg, + sd, + emission_sd, + L, + state, + &rng, + ray, + throughput, + ss_indirect)) + { + kernel_split_path_end(kg, ray_index); + } +#ifdef __BRANCHED_PATH__ + } + else if(IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT)) { + float bssrdf_probability; + ShaderClosure *sc = subsurface_scatter_pick_closure(kg, sd, &bssrdf_probability); + + /* modify throughput for picking bssrdf or bsdf */ + *throughput *= bssrdf_probability; + + /* do bssrdf scatter step if we picked a bssrdf closure */ + if(sc) { + uint lcg_state = lcg_state_init(&rng, state->rng_offset, state->sample, 0x68bc21eb); + float bssrdf_u, bssrdf_v; + path_state_rng_2D(kg, + &rng, + state, + PRNG_BSDF_U, + &bssrdf_u, &bssrdf_v); + subsurface_scatter_step(kg, + sd, + state, + state->flag, + sc, + &lcg_state, + bssrdf_u, bssrdf_v, + false); + } + } + else { + kernel_split_branched_path_subsurface_indirect_light_init(kg, ray_index); + + if(kernel_split_branched_path_subsurface_indirect_light_iter(kg, ray_index)) { + ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED); + } } +#endif } kernel_split_state.rng[ray_index] = rng; } -#ifndef __COMPUTE_DEVICE_GPU__ +# ifdef __BRANCHED_PATH__ + if(ccl_global_id(0) == 0 && 
ccl_global_id(1) == 0) { + kernel_split_params.queue_index[QUEUE_SUBSURFACE_INDIRECT_ITER] = 0; } -#endif - /* Enqueue RAY_UPDATE_BUFFER rays. */ - enqueue_ray_index_local(ray_index, - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS, - enqueue_flag, - kernel_split_params.queue_size, - local_queue_atomics, - kernel_split_state.queue_data, - kernel_split_params.queue_index); + /* iter loop */ + ray_index = get_ray_index(kg, ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0), + QUEUE_SUBSURFACE_INDIRECT_ITER, + kernel_split_state.queue_data, + kernel_split_params.queue_size, + 1); + + if(IS_STATE(ray_state, ray_index, RAY_SUBSURFACE_INDIRECT_NEXT_ITER)) { + /* for render passes, sum and reset indirect light pass variables + * for the next samples */ + path_radiance_sum_indirect(&kernel_split_state.path_radiance[ray_index]); + path_radiance_reset_indirect(&kernel_split_state.path_radiance[ray_index]); + + if(kernel_split_branched_path_subsurface_indirect_light_iter(kg, ray_index)) { + ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED); + } + } +# endif /* __BRANCHED_PATH__ */ #endif /* __SUBSURFACE__ */ diff --git a/intern/cycles/kernel/svm/svm_closure.h b/intern/cycles/kernel/svm/svm_closure.h index 1885e1af851..4268813b263 100644 --- a/intern/cycles/kernel/svm/svm_closure.h +++ b/intern/cycles/kernel/svm/svm_closure.h @@ -76,6 +76,345 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * float param2 = (stack_valid(param2_offset))? 
stack_load_float(stack, param2_offset): __uint_as_float(node.w); switch(type) { +#ifdef __PRINCIPLED__ + case CLOSURE_BSDF_PRINCIPLED_ID: { + uint specular_offset, roughness_offset, specular_tint_offset, anisotropic_offset, sheen_offset, + sheen_tint_offset, clearcoat_offset, clearcoat_roughness_offset, eta_offset, transmission_offset, + anisotropic_rotation_offset, transmission_roughness_offset; + uint4 data_node2 = read_node(kg, offset); + + float3 T = stack_load_float3(stack, data_node.y); + decode_node_uchar4(data_node.z, &specular_offset, &roughness_offset, &specular_tint_offset, &anisotropic_offset); + decode_node_uchar4(data_node.w, &sheen_offset, &sheen_tint_offset, &clearcoat_offset, &clearcoat_roughness_offset); + decode_node_uchar4(data_node2.x, &eta_offset, &transmission_offset, &anisotropic_rotation_offset, &transmission_roughness_offset); + + // get Disney principled parameters + float metallic = param1; + float subsurface = param2; + float specular = stack_load_float(stack, specular_offset); + float roughness = stack_load_float(stack, roughness_offset); + float specular_tint = stack_load_float(stack, specular_tint_offset); + float anisotropic = stack_load_float(stack, anisotropic_offset); + float sheen = stack_load_float(stack, sheen_offset); + float sheen_tint = stack_load_float(stack, sheen_tint_offset); + float clearcoat = stack_load_float(stack, clearcoat_offset); + float clearcoat_roughness = stack_load_float(stack, clearcoat_roughness_offset); + float transmission = stack_load_float(stack, transmission_offset); + float anisotropic_rotation = stack_load_float(stack, anisotropic_rotation_offset); + float transmission_roughness = stack_load_float(stack, transmission_roughness_offset); + float eta = fmaxf(stack_load_float(stack, eta_offset), 1e-5f); + + ClosureType distribution = stack_valid(data_node2.y) ? 
(ClosureType) data_node2.y : CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID; + + /* rotate tangent */ + if(anisotropic_rotation != 0.0f) + T = rotate_around_axis(T, N, anisotropic_rotation * M_2PI_F); + + /* calculate ior */ + float ior = (sd->flag & SD_BACKFACING) ? 1.0f / eta : eta; + + // calculate fresnel for refraction + float cosNO = dot(N, sd->I); + float fresnel = fresnel_dielectric_cos(cosNO, ior); + + // calculate weights of the diffuse and specular part + float diffuse_weight = (1.0f - saturate(metallic)) * (1.0f - saturate(transmission)); + + float final_transmission = saturate(transmission) * (1.0f - saturate(metallic)); + float specular_weight = (1.0f - final_transmission); + + // get the base color + uint4 data_base_color = read_node(kg, offset); + float3 base_color = stack_valid(data_base_color.x) ? stack_load_float3(stack, data_base_color.x) : + make_float3(__uint_as_float(data_base_color.y), __uint_as_float(data_base_color.z), __uint_as_float(data_base_color.w)); + + // get the additional clearcoat normal and subsurface scattering radius + uint4 data_cn_ssr = read_node(kg, offset); + float3 clearcoat_normal = stack_valid(data_cn_ssr.x) ? stack_load_float3(stack, data_cn_ssr.x) : sd->N; + float3 subsurface_radius = stack_valid(data_cn_ssr.y) ? stack_load_float3(stack, data_cn_ssr.y) : make_float3(1.0f, 1.0f, 1.0f); + + // get the subsurface color + uint4 data_subsurface_color = read_node(kg, offset); + float3 subsurface_color = stack_valid(data_subsurface_color.x) ? 
stack_load_float3(stack, data_subsurface_color.x) : + make_float3(__uint_as_float(data_subsurface_color.y), __uint_as_float(data_subsurface_color.z), __uint_as_float(data_subsurface_color.w)); + + float3 weight = sd->svm_closure_weight * mix_weight; + +#ifdef __SUBSURFACE__ + float3 mixed_ss_base_color = subsurface_color * subsurface + base_color * (1.0f - subsurface); + float3 subsurf_weight = weight * mixed_ss_base_color * diffuse_weight; + float subsurf_sample_weight = fabsf(average(subsurf_weight)); + + /* disable in case of diffuse ancestor, can't see it well then and + * adds considerably noise due to probabilities of continuing path + * getting lower and lower */ + if(path_flag & PATH_RAY_DIFFUSE_ANCESTOR) { + subsurface = 0.0f; + + /* need to set the base color in this case such that the + * rays get the correctly mixed color after transmitting + * the object */ + base_color = mixed_ss_base_color; + } + + /* diffuse */ + if(fabsf(average(mixed_ss_base_color)) > CLOSURE_WEIGHT_CUTOFF) { + if(subsurface <= CLOSURE_WEIGHT_CUTOFF && diffuse_weight > CLOSURE_WEIGHT_CUTOFF) { + float3 diff_weight = weight * base_color * diffuse_weight; + + PrincipledDiffuseBsdf *bsdf = (PrincipledDiffuseBsdf*)bsdf_alloc(sd, sizeof(PrincipledDiffuseBsdf), diff_weight); + + if(bsdf) { + bsdf->N = N; + bsdf->roughness = roughness; + + /* setup bsdf */ + sd->flag |= bsdf_principled_diffuse_setup(bsdf); + } + } + else if(subsurface > CLOSURE_WEIGHT_CUTOFF && subsurf_sample_weight > CLOSURE_WEIGHT_CUTOFF) { + /* radius * scale */ + float3 radius = subsurface_radius * subsurface; + /* sharpness */ + float sharpness = 0.0f; + /* texture color blur */ + float texture_blur = 0.0f; + + /* create one closure per color channel */ + Bssrdf *bssrdf = bssrdf_alloc(sd, make_float3(subsurf_weight.x, 0.0f, 0.0f)); + if(bssrdf) { + bssrdf->sample_weight = subsurf_sample_weight; + bssrdf->radius = radius.x; + bssrdf->texture_blur = texture_blur; + bssrdf->albedo = subsurface_color.x; + 
bssrdf->sharpness = sharpness; + bssrdf->N = N; + bssrdf->roughness = roughness; + + /* setup bsdf */ + sd->flag |= bssrdf_setup(bssrdf, (ClosureType)CLOSURE_BSSRDF_PRINCIPLED_ID); + } + + bssrdf = bssrdf_alloc(sd, make_float3(0.0f, subsurf_weight.y, 0.0f)); + if(bssrdf) { + bssrdf->sample_weight = subsurf_sample_weight; + bssrdf->radius = radius.y; + bssrdf->texture_blur = texture_blur; + bssrdf->albedo = subsurface_color.y; + bssrdf->sharpness = sharpness; + bssrdf->N = N; + bssrdf->roughness = roughness; + + /* setup bsdf */ + sd->flag |= bssrdf_setup(bssrdf, (ClosureType)CLOSURE_BSSRDF_PRINCIPLED_ID); + } + + bssrdf = bssrdf_alloc(sd, make_float3(0.0f, 0.0f, subsurf_weight.z)); + if(bssrdf) { + bssrdf->sample_weight = subsurf_sample_weight; + bssrdf->radius = radius.z; + bssrdf->texture_blur = texture_blur; + bssrdf->albedo = subsurface_color.z; + bssrdf->sharpness = sharpness; + bssrdf->N = N; + bssrdf->roughness = roughness; + + /* setup bsdf */ + sd->flag |= bssrdf_setup(bssrdf, (ClosureType)CLOSURE_BSSRDF_PRINCIPLED_ID); + } + } + } +#else + /* diffuse */ + if(diffuse_weight > CLOSURE_WEIGHT_CUTOFF) { + float3 diff_weight = weight * base_color * diffuse_weight; + + PrincipledDiffuseBsdf *bsdf = (PrincipledDiffuseBsdf*)bsdf_alloc(sd, sizeof(PrincipledDiffuseBsdf), diff_weight); + + if(bsdf) { + bsdf->N = N; + bsdf->roughness = roughness; + + /* setup bsdf */ + sd->flag |= bsdf_principled_diffuse_setup(bsdf); + } + } +#endif + + /* sheen */ + if(diffuse_weight > CLOSURE_WEIGHT_CUTOFF && sheen > CLOSURE_WEIGHT_CUTOFF) { + float m_cdlum = linear_rgb_to_gray(base_color); + float3 m_ctint = m_cdlum > 0.0f ? base_color / m_cdlum : make_float3(1.0f, 1.0f, 1.0f); // normalize lum. 
to isolate hue+sat + + /* color of the sheen component */ + float3 sheen_color = make_float3(1.0f, 1.0f, 1.0f) * (1.0f - sheen_tint) + m_ctint * sheen_tint; + + float3 sheen_weight = weight * sheen * sheen_color * diffuse_weight; + + PrincipledSheenBsdf *bsdf = (PrincipledSheenBsdf*)bsdf_alloc(sd, sizeof(PrincipledSheenBsdf), sheen_weight); + + if(bsdf) { + bsdf->N = N; + + /* setup bsdf */ + sd->flag |= bsdf_principled_sheen_setup(bsdf); + } + } + + /* specular reflection */ +#ifdef __CAUSTICS_TRICKS__ + if(kernel_data.integrator.caustics_reflective || (path_flag & PATH_RAY_DIFFUSE) == 0) { +#endif + if(specular_weight > CLOSURE_WEIGHT_CUTOFF && (specular > CLOSURE_WEIGHT_CUTOFF || metallic > CLOSURE_WEIGHT_CUTOFF)) { + float3 spec_weight = weight * specular_weight; + + MicrofacetBsdf *bsdf = (MicrofacetBsdf*)bsdf_alloc(sd, sizeof(MicrofacetBsdf), spec_weight); + MicrofacetExtra *extra = (MicrofacetExtra*)closure_alloc_extra(sd, sizeof(MicrofacetExtra)); + + if(bsdf && extra) { + bsdf->N = N; + bsdf->ior = (2.0f / (1.0f - safe_sqrtf(0.08f * specular))) - 1.0f; + bsdf->T = T; + bsdf->extra = extra; + + float aspect = safe_sqrtf(1.0f - anisotropic * 0.9f); + float r2 = roughness * roughness; + + bsdf->alpha_x = r2 / aspect; + bsdf->alpha_y = r2 * aspect; + + float m_cdlum = 0.3f * base_color.x + 0.6f * base_color.y + 0.1f * base_color.z; // luminance approx. + float3 m_ctint = m_cdlum > 0.0f ? base_color / m_cdlum : make_float3(0.0f, 0.0f, 0.0f); // normalize lum. 
to isolate hue+sat + float3 tmp_col = make_float3(1.0f, 1.0f, 1.0f) * (1.0f - specular_tint) + m_ctint * specular_tint; + + bsdf->extra->cspec0 = (specular * 0.08f * tmp_col) * (1.0f - metallic) + base_color * metallic; + bsdf->extra->color = base_color; + + /* setup bsdf */ + if(distribution == CLOSURE_BSDF_MICROFACET_GGX_GLASS_ID || roughness <= 0.075f) /* use single-scatter GGX */ + sd->flag |= bsdf_microfacet_ggx_aniso_fresnel_setup(bsdf, sd); + else /* use multi-scatter GGX */ + sd->flag |= bsdf_microfacet_multi_ggx_aniso_fresnel_setup(bsdf, sd); + } + } +#ifdef __CAUSTICS_TRICKS__ + } +#endif + + /* BSDF */ +#ifdef __CAUSTICS_TRICKS__ + if(kernel_data.integrator.caustics_reflective || kernel_data.integrator.caustics_refractive || (path_flag & PATH_RAY_DIFFUSE) == 0) { +#endif + if(final_transmission > CLOSURE_WEIGHT_CUTOFF) { + float3 glass_weight = weight * final_transmission; + float3 cspec0 = base_color * specular_tint + make_float3(1.0f, 1.0f, 1.0f) * (1.0f - specular_tint); + + if(roughness <= 5e-2f || distribution == CLOSURE_BSDF_MICROFACET_GGX_GLASS_ID) { /* use single-scatter GGX */ + float refl_roughness = roughness; + + /* reflection */ +#ifdef __CAUSTICS_TRICKS__ + if(kernel_data.integrator.caustics_reflective || (path_flag & PATH_RAY_DIFFUSE) == 0) +#endif + { + MicrofacetBsdf *bsdf = (MicrofacetBsdf*)bsdf_alloc(sd, sizeof(MicrofacetBsdf), glass_weight*fresnel); + MicrofacetExtra *extra = (MicrofacetExtra*)closure_alloc_extra(sd, sizeof(MicrofacetExtra)); + + if(bsdf && extra) { + bsdf->N = N; + bsdf->extra = extra; + + bsdf->alpha_x = refl_roughness * refl_roughness; + bsdf->alpha_y = refl_roughness * refl_roughness; + bsdf->ior = ior; + + bsdf->extra->color = base_color; + bsdf->extra->cspec0 = cspec0; + + /* setup bsdf */ + sd->flag |= bsdf_microfacet_ggx_fresnel_setup(bsdf, sd); + } + } + + /* refraction */ +#ifdef __CAUSTICS_TRICKS__ + if(kernel_data.integrator.caustics_refractive || (path_flag & PATH_RAY_DIFFUSE) == 0) +#endif + { + 
MicrofacetBsdf *bsdf = (MicrofacetBsdf*)bsdf_alloc(sd, sizeof(MicrofacetBsdf), base_color*glass_weight*(1.0f - fresnel)); + + if(bsdf) { + bsdf->N = N; + + if(distribution == CLOSURE_BSDF_MICROFACET_GGX_GLASS_ID) + transmission_roughness = 1.0f - (1.0f - refl_roughness) * (1.0f - transmission_roughness); + else + transmission_roughness = refl_roughness; + + bsdf->alpha_x = transmission_roughness * transmission_roughness; + bsdf->alpha_y = transmission_roughness * transmission_roughness; + bsdf->ior = ior; + + /* setup bsdf */ + sd->flag |= bsdf_microfacet_ggx_refraction_setup(bsdf); + } + } + } + else { /* use multi-scatter GGX */ + MicrofacetBsdf *bsdf = (MicrofacetBsdf*)bsdf_alloc(sd, sizeof(MicrofacetBsdf), glass_weight); + MicrofacetExtra *extra = (MicrofacetExtra*)closure_alloc_extra(sd, sizeof(MicrofacetExtra)); + + if(bsdf && extra) { + bsdf->N = N; + bsdf->extra = extra; + bsdf->T = make_float3(0.0f, 0.0f, 0.0f); + + bsdf->alpha_x = roughness * roughness; + bsdf->alpha_y = roughness * roughness; + bsdf->ior = ior; + + bsdf->extra->color = base_color; + bsdf->extra->cspec0 = cspec0; + + /* setup bsdf */ + sd->flag |= bsdf_microfacet_multi_ggx_glass_fresnel_setup(bsdf, sd); + } + } + } +#ifdef __CAUSTICS_TRICKS__ + } +#endif + + /* clearcoat */ +#ifdef __CAUSTICS_TRICKS__ + if(kernel_data.integrator.caustics_reflective || (path_flag & PATH_RAY_DIFFUSE) == 0) { +#endif + if(clearcoat > CLOSURE_WEIGHT_CUTOFF) { + MicrofacetBsdf *bsdf = (MicrofacetBsdf*)bsdf_alloc(sd, sizeof(MicrofacetBsdf), weight); + MicrofacetExtra *extra = (MicrofacetExtra*)closure_alloc_extra(sd, sizeof(MicrofacetExtra)); + + if(bsdf && extra) { + bsdf->N = clearcoat_normal; + bsdf->ior = 1.5f; + bsdf->extra = extra; + + bsdf->alpha_x = clearcoat_roughness * clearcoat_roughness; + bsdf->alpha_y = clearcoat_roughness * clearcoat_roughness; + + bsdf->extra->cspec0 = make_float3(0.04f, 0.04f, 0.04f); + bsdf->extra->clearcoat = clearcoat; + + /* setup bsdf */ + sd->flag |= 
bsdf_microfacet_ggx_clearcoat_setup(bsdf, sd); + } + } +#ifdef __CAUSTICS_TRICKS__ + } +#endif + + break; + } +#endif /* __PRINCIPLED__ */ case CLOSURE_BSDF_DIFFUSE_ID: { float3 weight = sd->svm_closure_weight * mix_weight; OrenNayarBsdf *bsdf = (OrenNayarBsdf*)bsdf_alloc(sd, sizeof(OrenNayarBsdf), weight); @@ -110,6 +449,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * ShaderClosure *bsdf = bsdf_alloc(sd, sizeof(ShaderClosure), weight); if(bsdf) { + bsdf->N = N; sd->flag |= bsdf_transparent_setup(bsdf); } break; @@ -344,6 +684,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * #ifdef __CAUSTICS_TRICKS__ if(!kernel_data.integrator.caustics_reflective && (path_flag & PATH_RAY_DIFFUSE)) break; + ATTR_FALLTHROUGH; #endif case CLOSURE_BSDF_DIFFUSE_TOON_ID: { float3 weight = sd->svm_closure_weight * mix_weight; @@ -370,6 +711,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * ShaderClosure *bsdf = bsdf_alloc(sd, sizeof(ShaderClosure), weight); if(bsdf) { + bsdf->N = N; /* todo: giving a fixed weight here will cause issues when * mixing multiple BSDFS. energy will not be conserved and * the throughput can blow up after multiple bounces. 
we @@ -383,6 +725,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * HairBsdf *bsdf = (HairBsdf*)bsdf_alloc(sd, sizeof(HairBsdf), weight); if(bsdf) { + bsdf->N = N; bsdf->roughness1 = param1; bsdf->roughness2 = param2; bsdf->offset = -stack_load_float(stack, data_node.z); diff --git a/intern/cycles/kernel/svm/svm_displace.h b/intern/cycles/kernel/svm/svm_displace.h index c94fa130af7..656357be52d 100644 --- a/intern/cycles/kernel/svm/svm_displace.h +++ b/intern/cycles/kernel/svm/svm_displace.h @@ -63,8 +63,13 @@ ccl_device void svm_node_set_bump(KernelGlobals *kg, ShaderData *sd, float *stac strength = max(strength, 0.0f); /* compute and output perturbed normal */ - float3 normal_out = normalize(absdet*normal_in - distance*signf(det)*surfgrad); - normal_out = normalize(strength*normal_out + (1.0f - strength)*normal_in); + float3 normal_out = safe_normalize(absdet*normal_in - distance*signf(det)*surfgrad); + if(is_zero(normal_out)) { + normal_out = normal_in; + } + else { + normal_out = normalize(strength*normal_out + (1.0f - strength)*normal_in); + } if(use_object_space) { object_normal_transform(kg, sd, &normal_out); diff --git a/intern/cycles/kernel/svm/svm_geometry.h b/intern/cycles/kernel/svm/svm_geometry.h index 4a09d9f6653..cce4e89e715 100644 --- a/intern/cycles/kernel/svm/svm_geometry.h +++ b/intern/cycles/kernel/svm/svm_geometry.h @@ -37,6 +37,7 @@ ccl_device_inline void svm_node_geometry(KernelGlobals *kg, #ifdef __UV__ case NODE_GEOM_uv: data = make_float3(sd->u, sd->v, 0.0f); break; #endif + default: data = make_float3(0.0f, 0.0f, 0.0f); } stack_store_float3(stack, out_offset, data); diff --git a/intern/cycles/kernel/svm/svm_image.h b/intern/cycles/kernel/svm/svm_image.h index 76acc9253a1..7be03dcd65a 100644 --- a/intern/cycles/kernel/svm/svm_image.h +++ b/intern/cycles/kernel/svm/svm_image.h @@ -16,29 +16,10 @@ CCL_NAMESPACE_BEGIN -/* Float4 textures on various devices. 
*/ -#if defined(__KERNEL_CPU__) -# define TEX_NUM_FLOAT4_IMAGES TEX_NUM_FLOAT4_CPU -#elif defined(__KERNEL_CUDA__) -# if __CUDA_ARCH__ < 300 -# define TEX_NUM_FLOAT4_IMAGES TEX_NUM_FLOAT4_CUDA -# else -# define TEX_NUM_FLOAT4_IMAGES TEX_NUM_FLOAT4_CUDA_KEPLER -# endif -#else -# define TEX_NUM_FLOAT4_IMAGES TEX_NUM_FLOAT4_OPENCL -#endif - ccl_device float4 svm_image_texture(KernelGlobals *kg, int id, float x, float y, uint srgb, uint use_alpha) { #ifdef __KERNEL_CPU__ -# ifdef __KERNEL_SSE2__ - ssef r_ssef; - float4 &r = (float4 &)r_ssef; - r = kernel_tex_image_interp(id, x, y); -# else float4 r = kernel_tex_image_interp(id, x, y); -# endif #elif defined(__KERNEL_OPENCL__) float4 r = kernel_tex_image_interp(kg, id, x, y); #else @@ -56,94 +37,94 @@ ccl_device float4 svm_image_texture(KernelGlobals *kg, int id, float x, float y, switch(id) { case 0: r = kernel_tex_image_interp(__tex_image_float4_000, x, y); break; - case 1: r = kernel_tex_image_interp(__tex_image_float4_001, x, y); break; - case 2: r = kernel_tex_image_interp(__tex_image_float4_002, x, y); break; - case 3: r = kernel_tex_image_interp(__tex_image_float4_003, x, y); break; - case 4: r = kernel_tex_image_interp(__tex_image_float4_004, x, y); break; - case 5: r = kernel_tex_image_interp(__tex_image_byte4_005, x, y); break; - case 6: r = kernel_tex_image_interp(__tex_image_byte4_006, x, y); break; - case 7: r = kernel_tex_image_interp(__tex_image_byte4_007, x, y); break; - case 8: r = kernel_tex_image_interp(__tex_image_byte4_008, x, y); break; + case 8: r = kernel_tex_image_interp(__tex_image_float4_008, x, y); break; + case 16: r = kernel_tex_image_interp(__tex_image_float4_016, x, y); break; + case 24: r = kernel_tex_image_interp(__tex_image_float4_024, x, y); break; + case 32: r = kernel_tex_image_interp(__tex_image_float4_032, x, y); break; + case 1: r = kernel_tex_image_interp(__tex_image_byte4_001, x, y); break; case 9: r = kernel_tex_image_interp(__tex_image_byte4_009, x, y); break; - case 10: r = 
kernel_tex_image_interp(__tex_image_byte4_010, x, y); break; - case 11: r = kernel_tex_image_interp(__tex_image_byte4_011, x, y); break; - case 12: r = kernel_tex_image_interp(__tex_image_byte4_012, x, y); break; - case 13: r = kernel_tex_image_interp(__tex_image_byte4_013, x, y); break; - case 14: r = kernel_tex_image_interp(__tex_image_byte4_014, x, y); break; - case 15: r = kernel_tex_image_interp(__tex_image_byte4_015, x, y); break; - case 16: r = kernel_tex_image_interp(__tex_image_byte4_016, x, y); break; case 17: r = kernel_tex_image_interp(__tex_image_byte4_017, x, y); break; - case 18: r = kernel_tex_image_interp(__tex_image_byte4_018, x, y); break; - case 19: r = kernel_tex_image_interp(__tex_image_byte4_019, x, y); break; - case 20: r = kernel_tex_image_interp(__tex_image_byte4_020, x, y); break; - case 21: r = kernel_tex_image_interp(__tex_image_byte4_021, x, y); break; - case 22: r = kernel_tex_image_interp(__tex_image_byte4_022, x, y); break; - case 23: r = kernel_tex_image_interp(__tex_image_byte4_023, x, y); break; - case 24: r = kernel_tex_image_interp(__tex_image_byte4_024, x, y); break; case 25: r = kernel_tex_image_interp(__tex_image_byte4_025, x, y); break; - case 26: r = kernel_tex_image_interp(__tex_image_byte4_026, x, y); break; - case 27: r = kernel_tex_image_interp(__tex_image_byte4_027, x, y); break; - case 28: r = kernel_tex_image_interp(__tex_image_byte4_028, x, y); break; - case 29: r = kernel_tex_image_interp(__tex_image_byte4_029, x, y); break; - case 30: r = kernel_tex_image_interp(__tex_image_byte4_030, x, y); break; - case 31: r = kernel_tex_image_interp(__tex_image_byte4_031, x, y); break; - case 32: r = kernel_tex_image_interp(__tex_image_byte4_032, x, y); break; case 33: r = kernel_tex_image_interp(__tex_image_byte4_033, x, y); break; - case 34: r = kernel_tex_image_interp(__tex_image_byte4_034, x, y); break; - case 35: r = kernel_tex_image_interp(__tex_image_byte4_035, x, y); break; - case 36: r = 
kernel_tex_image_interp(__tex_image_byte4_036, x, y); break; - case 37: r = kernel_tex_image_interp(__tex_image_byte4_037, x, y); break; - case 38: r = kernel_tex_image_interp(__tex_image_byte4_038, x, y); break; - case 39: r = kernel_tex_image_interp(__tex_image_byte4_039, x, y); break; - case 40: r = kernel_tex_image_interp(__tex_image_byte4_040, x, y); break; case 41: r = kernel_tex_image_interp(__tex_image_byte4_041, x, y); break; - case 42: r = kernel_tex_image_interp(__tex_image_byte4_042, x, y); break; - case 43: r = kernel_tex_image_interp(__tex_image_byte4_043, x, y); break; - case 44: r = kernel_tex_image_interp(__tex_image_byte4_044, x, y); break; - case 45: r = kernel_tex_image_interp(__tex_image_byte4_045, x, y); break; - case 46: r = kernel_tex_image_interp(__tex_image_byte4_046, x, y); break; - case 47: r = kernel_tex_image_interp(__tex_image_byte4_047, x, y); break; - case 48: r = kernel_tex_image_interp(__tex_image_byte4_048, x, y); break; case 49: r = kernel_tex_image_interp(__tex_image_byte4_049, x, y); break; - case 50: r = kernel_tex_image_interp(__tex_image_byte4_050, x, y); break; - case 51: r = kernel_tex_image_interp(__tex_image_byte4_051, x, y); break; - case 52: r = kernel_tex_image_interp(__tex_image_byte4_052, x, y); break; - case 53: r = kernel_tex_image_interp(__tex_image_byte4_053, x, y); break; - case 54: r = kernel_tex_image_interp(__tex_image_byte4_054, x, y); break; - case 55: r = kernel_tex_image_interp(__tex_image_byte4_055, x, y); break; - case 56: r = kernel_tex_image_interp(__tex_image_byte4_056, x, y); break; case 57: r = kernel_tex_image_interp(__tex_image_byte4_057, x, y); break; - case 58: r = kernel_tex_image_interp(__tex_image_byte4_058, x, y); break; - case 59: r = kernel_tex_image_interp(__tex_image_byte4_059, x, y); break; - case 60: r = kernel_tex_image_interp(__tex_image_byte4_060, x, y); break; - case 61: r = kernel_tex_image_interp(__tex_image_byte4_061, x, y); break; - case 62: r = 
kernel_tex_image_interp(__tex_image_byte4_062, x, y); break; - case 63: r = kernel_tex_image_interp(__tex_image_byte4_063, x, y); break; - case 64: r = kernel_tex_image_interp(__tex_image_byte4_064, x, y); break; case 65: r = kernel_tex_image_interp(__tex_image_byte4_065, x, y); break; - case 66: r = kernel_tex_image_interp(__tex_image_byte4_066, x, y); break; - case 67: r = kernel_tex_image_interp(__tex_image_byte4_067, x, y); break; - case 68: r = kernel_tex_image_interp(__tex_image_byte4_068, x, y); break; - case 69: r = kernel_tex_image_interp(__tex_image_byte4_069, x, y); break; - case 70: r = kernel_tex_image_interp(__tex_image_byte4_070, x, y); break; - case 71: r = kernel_tex_image_interp(__tex_image_byte4_071, x, y); break; - case 72: r = kernel_tex_image_interp(__tex_image_byte4_072, x, y); break; case 73: r = kernel_tex_image_interp(__tex_image_byte4_073, x, y); break; - case 74: r = kernel_tex_image_interp(__tex_image_byte4_074, x, y); break; - case 75: r = kernel_tex_image_interp(__tex_image_byte4_075, x, y); break; - case 76: r = kernel_tex_image_interp(__tex_image_byte4_076, x, y); break; - case 77: r = kernel_tex_image_interp(__tex_image_byte4_077, x, y); break; - case 78: r = kernel_tex_image_interp(__tex_image_byte4_078, x, y); break; - case 79: r = kernel_tex_image_interp(__tex_image_byte4_079, x, y); break; - case 80: r = kernel_tex_image_interp(__tex_image_byte4_080, x, y); break; case 81: r = kernel_tex_image_interp(__tex_image_byte4_081, x, y); break; - case 82: r = kernel_tex_image_interp(__tex_image_byte4_082, x, y); break; - case 83: r = kernel_tex_image_interp(__tex_image_byte4_083, x, y); break; - case 84: r = kernel_tex_image_interp(__tex_image_byte4_084, x, y); break; - case 85: r = kernel_tex_image_interp(__tex_image_byte4_085, x, y); break; - case 86: r = kernel_tex_image_interp(__tex_image_byte4_086, x, y); break; - case 87: r = kernel_tex_image_interp(__tex_image_byte4_087, x, y); break; - case 88: r = 
kernel_tex_image_interp(__tex_image_byte4_088, x, y); break; + case 89: r = kernel_tex_image_interp(__tex_image_byte4_089, x, y); break; + case 97: r = kernel_tex_image_interp(__tex_image_byte4_097, x, y); break; + case 105: r = kernel_tex_image_interp(__tex_image_byte4_105, x, y); break; + case 113: r = kernel_tex_image_interp(__tex_image_byte4_113, x, y); break; + case 121: r = kernel_tex_image_interp(__tex_image_byte4_121, x, y); break; + case 129: r = kernel_tex_image_interp(__tex_image_byte4_129, x, y); break; + case 137: r = kernel_tex_image_interp(__tex_image_byte4_137, x, y); break; + case 145: r = kernel_tex_image_interp(__tex_image_byte4_145, x, y); break; + case 153: r = kernel_tex_image_interp(__tex_image_byte4_153, x, y); break; + case 161: r = kernel_tex_image_interp(__tex_image_byte4_161, x, y); break; + case 169: r = kernel_tex_image_interp(__tex_image_byte4_169, x, y); break; + case 177: r = kernel_tex_image_interp(__tex_image_byte4_177, x, y); break; + case 185: r = kernel_tex_image_interp(__tex_image_byte4_185, x, y); break; + case 193: r = kernel_tex_image_interp(__tex_image_byte4_193, x, y); break; + case 201: r = kernel_tex_image_interp(__tex_image_byte4_201, x, y); break; + case 209: r = kernel_tex_image_interp(__tex_image_byte4_209, x, y); break; + case 217: r = kernel_tex_image_interp(__tex_image_byte4_217, x, y); break; + case 225: r = kernel_tex_image_interp(__tex_image_byte4_225, x, y); break; + case 233: r = kernel_tex_image_interp(__tex_image_byte4_233, x, y); break; + case 241: r = kernel_tex_image_interp(__tex_image_byte4_241, x, y); break; + case 249: r = kernel_tex_image_interp(__tex_image_byte4_249, x, y); break; + case 257: r = kernel_tex_image_interp(__tex_image_byte4_257, x, y); break; + case 265: r = kernel_tex_image_interp(__tex_image_byte4_265, x, y); break; + case 273: r = kernel_tex_image_interp(__tex_image_byte4_273, x, y); break; + case 281: r = kernel_tex_image_interp(__tex_image_byte4_281, x, y); break; + case 289: r = 
kernel_tex_image_interp(__tex_image_byte4_289, x, y); break; + case 297: r = kernel_tex_image_interp(__tex_image_byte4_297, x, y); break; + case 305: r = kernel_tex_image_interp(__tex_image_byte4_305, x, y); break; + case 313: r = kernel_tex_image_interp(__tex_image_byte4_313, x, y); break; + case 321: r = kernel_tex_image_interp(__tex_image_byte4_321, x, y); break; + case 329: r = kernel_tex_image_interp(__tex_image_byte4_329, x, y); break; + case 337: r = kernel_tex_image_interp(__tex_image_byte4_337, x, y); break; + case 345: r = kernel_tex_image_interp(__tex_image_byte4_345, x, y); break; + case 353: r = kernel_tex_image_interp(__tex_image_byte4_353, x, y); break; + case 361: r = kernel_tex_image_interp(__tex_image_byte4_361, x, y); break; + case 369: r = kernel_tex_image_interp(__tex_image_byte4_369, x, y); break; + case 377: r = kernel_tex_image_interp(__tex_image_byte4_377, x, y); break; + case 385: r = kernel_tex_image_interp(__tex_image_byte4_385, x, y); break; + case 393: r = kernel_tex_image_interp(__tex_image_byte4_393, x, y); break; + case 401: r = kernel_tex_image_interp(__tex_image_byte4_401, x, y); break; + case 409: r = kernel_tex_image_interp(__tex_image_byte4_409, x, y); break; + case 417: r = kernel_tex_image_interp(__tex_image_byte4_417, x, y); break; + case 425: r = kernel_tex_image_interp(__tex_image_byte4_425, x, y); break; + case 433: r = kernel_tex_image_interp(__tex_image_byte4_433, x, y); break; + case 441: r = kernel_tex_image_interp(__tex_image_byte4_441, x, y); break; + case 449: r = kernel_tex_image_interp(__tex_image_byte4_449, x, y); break; + case 457: r = kernel_tex_image_interp(__tex_image_byte4_457, x, y); break; + case 465: r = kernel_tex_image_interp(__tex_image_byte4_465, x, y); break; + case 473: r = kernel_tex_image_interp(__tex_image_byte4_473, x, y); break; + case 481: r = kernel_tex_image_interp(__tex_image_byte4_481, x, y); break; + case 489: r = kernel_tex_image_interp(__tex_image_byte4_489, x, y); break; + case 497: r 
= kernel_tex_image_interp(__tex_image_byte4_497, x, y); break; + case 505: r = kernel_tex_image_interp(__tex_image_byte4_505, x, y); break; + case 513: r = kernel_tex_image_interp(__tex_image_byte4_513, x, y); break; + case 521: r = kernel_tex_image_interp(__tex_image_byte4_521, x, y); break; + case 529: r = kernel_tex_image_interp(__tex_image_byte4_529, x, y); break; + case 537: r = kernel_tex_image_interp(__tex_image_byte4_537, x, y); break; + case 545: r = kernel_tex_image_interp(__tex_image_byte4_545, x, y); break; + case 553: r = kernel_tex_image_interp(__tex_image_byte4_553, x, y); break; + case 561: r = kernel_tex_image_interp(__tex_image_byte4_561, x, y); break; + case 569: r = kernel_tex_image_interp(__tex_image_byte4_569, x, y); break; + case 577: r = kernel_tex_image_interp(__tex_image_byte4_577, x, y); break; + case 585: r = kernel_tex_image_interp(__tex_image_byte4_585, x, y); break; + case 593: r = kernel_tex_image_interp(__tex_image_byte4_593, x, y); break; + case 601: r = kernel_tex_image_interp(__tex_image_byte4_601, x, y); break; + case 609: r = kernel_tex_image_interp(__tex_image_byte4_609, x, y); break; + case 617: r = kernel_tex_image_interp(__tex_image_byte4_617, x, y); break; + case 625: r = kernel_tex_image_interp(__tex_image_byte4_625, x, y); break; + case 633: r = kernel_tex_image_interp(__tex_image_byte4_633, x, y); break; + case 641: r = kernel_tex_image_interp(__tex_image_byte4_641, x, y); break; + case 649: r = kernel_tex_image_interp(__tex_image_byte4_649, x, y); break; + case 657: r = kernel_tex_image_interp(__tex_image_byte4_657, x, y); break; + case 665: r = kernel_tex_image_interp(__tex_image_byte4_665, x, y); break; default: kernel_assert(0); return make_float4(0.0f, 0.0f, 0.0f, 0.0f); @@ -151,8 +132,13 @@ ccl_device float4 svm_image_texture(KernelGlobals *kg, int id, float x, float y, # else CUtexObject tex = kernel_tex_fetch(__bindless_mapping, id); /* float4, byte4 and half4 */ - if(id < TEX_START_FLOAT_CUDA_KEPLER) + const 
int texture_type = kernel_tex_type(id); + if(texture_type == IMAGE_DATA_TYPE_FLOAT4 || + texture_type == IMAGE_DATA_TYPE_BYTE4 || + texture_type == IMAGE_DATA_TYPE_HALF4) + { r = kernel_tex_image_interp_float4(tex, x, y); + } /* float, byte and half */ else { float f = kernel_tex_image_interp_float(tex, x, y); @@ -161,40 +147,22 @@ ccl_device float4 svm_image_texture(KernelGlobals *kg, int id, float x, float y, # endif #endif -#ifdef __KERNEL_SSE2__ - float alpha = r.w; + const float alpha = r.w; if(use_alpha && alpha != 1.0f && alpha != 0.0f) { - r_ssef = r_ssef / ssef(alpha); - if(id >= TEX_NUM_FLOAT4_IMAGES) - r_ssef = min(r_ssef, ssef(1.0f)); - r.w = alpha; - } - - if(srgb) { - r_ssef = color_srgb_to_scene_linear(r_ssef); - r.w = alpha; - } -#else - if(use_alpha && r.w != 1.0f && r.w != 0.0f) { - float invw = 1.0f/r.w; - r.x *= invw; - r.y *= invw; - r.z *= invw; - - if(id >= TEX_NUM_FLOAT4_IMAGES) { - r.x = min(r.x, 1.0f); - r.y = min(r.y, 1.0f); - r.z = min(r.z, 1.0f); + r /= alpha; + const int texture_type = kernel_tex_type(id); + if(texture_type == IMAGE_DATA_TYPE_BYTE4 || + texture_type == IMAGE_DATA_TYPE_BYTE) + { + r = min(r, make_float4(1.0f, 1.0f, 1.0f, 1.0f)); } + r.w = alpha; } if(srgb) { - r.x = color_srgb_to_scene_linear(r.x); - r.y = color_srgb_to_scene_linear(r.y); - r.z = color_srgb_to_scene_linear(r.z); + r = color_srgb_to_scene_linear_v4(r); } -#endif return r; } @@ -336,8 +304,8 @@ ccl_device void svm_node_tex_environment(KernelGlobals *kg, ShaderData *sd, floa float3 co = stack_load_float3(stack, co_offset); float2 uv; - co = normalize(co); - + co = safe_normalize(co); + if(projection == 0) uv = direction_to_equirectangular(co); else diff --git a/intern/cycles/kernel/svm/svm_types.h b/intern/cycles/kernel/svm/svm_types.h index 47209ddfbab..d859cae1708 100644 --- a/intern/cycles/kernel/svm/svm_types.h +++ b/intern/cycles/kernel/svm/svm_types.h @@ -397,17 +397,23 @@ typedef enum ClosureType { CLOSURE_BSDF_DIFFUSE_ID, 
CLOSURE_BSDF_OREN_NAYAR_ID, CLOSURE_BSDF_DIFFUSE_RAMP_ID, + CLOSURE_BSDF_PRINCIPLED_DIFFUSE_ID, + CLOSURE_BSDF_PRINCIPLED_SHEEN_ID, CLOSURE_BSDF_DIFFUSE_TOON_ID, /* Glossy */ - CLOSURE_BSDF_GLOSSY_ID, CLOSURE_BSDF_REFLECTION_ID, CLOSURE_BSDF_MICROFACET_GGX_ID, + CLOSURE_BSDF_MICROFACET_GGX_FRESNEL_ID, + CLOSURE_BSDF_MICROFACET_GGX_CLEARCOAT_ID, CLOSURE_BSDF_MICROFACET_BECKMANN_ID, CLOSURE_BSDF_MICROFACET_MULTI_GGX_ID, + CLOSURE_BSDF_MICROFACET_MULTI_GGX_FRESNEL_ID, CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ID, CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID, + CLOSURE_BSDF_MICROFACET_GGX_ANISO_FRESNEL_ID, CLOSURE_BSDF_MICROFACET_MULTI_GGX_ANISO_ID, + CLOSURE_BSDF_MICROFACET_MULTI_GGX_ANISO_FRESNEL_ID, CLOSURE_BSDF_MICROFACET_BECKMANN_ANISO_ID, CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ANISO_ID, CLOSURE_BSDF_ASHIKHMIN_VELVET_ID, @@ -416,24 +422,26 @@ typedef enum ClosureType { CLOSURE_BSDF_HAIR_REFLECTION_ID, /* Transmission */ - CLOSURE_BSDF_TRANSMISSION_ID, CLOSURE_BSDF_TRANSLUCENT_ID, CLOSURE_BSDF_REFRACTION_ID, CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID, CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID, + CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID, CLOSURE_BSDF_MICROFACET_BECKMANN_GLASS_ID, CLOSURE_BSDF_MICROFACET_GGX_GLASS_ID, - CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID, + CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_FRESNEL_ID, CLOSURE_BSDF_SHARP_GLASS_ID, CLOSURE_BSDF_HAIR_TRANSMISSION_ID, /* Special cases */ CLOSURE_BSDF_BSSRDF_ID, + CLOSURE_BSDF_BSSRDF_PRINCIPLED_ID, CLOSURE_BSDF_TRANSPARENT_ID, /* BSSRDF */ CLOSURE_BSSRDF_CUBIC_ID, CLOSURE_BSSRDF_GAUSSIAN_ID, + CLOSURE_BSSRDF_PRINCIPLED_ID, CLOSURE_BSSRDF_BURLEY_ID, /* Other */ @@ -447,19 +455,24 @@ typedef enum ClosureType { CLOSURE_VOLUME_ABSORPTION_ID, CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID, + CLOSURE_BSDF_PRINCIPLED_ID, + NBUILTIN_CLOSURES } ClosureType; /* watch this, being lazy with memory usage */ #define CLOSURE_IS_BSDF(type) (type <= CLOSURE_BSDF_TRANSPARENT_ID) #define CLOSURE_IS_BSDF_DIFFUSE(type) (type >= CLOSURE_BSDF_DIFFUSE_ID && type 
<= CLOSURE_BSDF_DIFFUSE_TOON_ID) -#define CLOSURE_IS_BSDF_GLOSSY(type) (type >= CLOSURE_BSDF_GLOSSY_ID && type <= CLOSURE_BSDF_HAIR_REFLECTION_ID) -#define CLOSURE_IS_BSDF_TRANSMISSION(type) (type >= CLOSURE_BSDF_TRANSMISSION_ID && type <= CLOSURE_BSDF_HAIR_TRANSMISSION_ID) -#define CLOSURE_IS_BSDF_BSSRDF(type) (type == CLOSURE_BSDF_BSSRDF_ID) +#define CLOSURE_IS_BSDF_GLOSSY(type) (type >= CLOSURE_BSDF_REFLECTION_ID && type <= CLOSURE_BSDF_HAIR_REFLECTION_ID) +#define CLOSURE_IS_BSDF_TRANSMISSION(type) (type >= CLOSURE_BSDF_TRANSLUCENT_ID && type <= CLOSURE_BSDF_HAIR_TRANSMISSION_ID) +#define CLOSURE_IS_BSDF_BSSRDF(type) (type == CLOSURE_BSDF_BSSRDF_ID || type == CLOSURE_BSDF_BSSRDF_PRINCIPLED_ID) +#define CLOSURE_IS_BSDF_TRANSPARENT(type) (type == CLOSURE_BSDF_TRANSPARENT_ID) #define CLOSURE_IS_BSDF_ANISOTROPIC(type) (type >= CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID && type <= CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ANISO_ID) #define CLOSURE_IS_BSDF_MULTISCATTER(type) (type == CLOSURE_BSDF_MICROFACET_MULTI_GGX_ID ||\ type == CLOSURE_BSDF_MICROFACET_MULTI_GGX_ANISO_ID || \ - type == CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID) + type == CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID) +#define CLOSURE_IS_BSDF_MICROFACET(type) ((type >= CLOSURE_BSDF_REFLECTION_ID && type <= CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ANISO_ID) ||\ + (type >= CLOSURE_BSDF_REFRACTION_ID && type <= CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID)) #define CLOSURE_IS_BSDF_OR_BSSRDF(type) (type <= CLOSURE_BSSRDF_BURLEY_ID) #define CLOSURE_IS_BSSRDF(type) (type >= CLOSURE_BSSRDF_CUBIC_ID && type <= CLOSURE_BSSRDF_BURLEY_ID) #define CLOSURE_IS_VOLUME(type) (type >= CLOSURE_VOLUME_ID && type <= CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID) @@ -468,7 +481,8 @@ typedef enum ClosureType { #define CLOSURE_IS_BACKGROUND(type) (type == CLOSURE_BACKGROUND_ID) #define CLOSURE_IS_AMBIENT_OCCLUSION(type) (type == CLOSURE_AMBIENT_OCCLUSION_ID) #define CLOSURE_IS_PHASE(type) (type == CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID) -#define 
CLOSURE_IS_GLASS(type) (type >= CLOSURE_BSDF_MICROFACET_BECKMANN_GLASS_ID && type <= CLOSURE_BSDF_SHARP_GLASS_ID) +#define CLOSURE_IS_GLASS(type) (type >= CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID && type <= CLOSURE_BSDF_SHARP_GLASS_ID) +#define CLOSURE_IS_PRINCIPLED(type) (type == CLOSURE_BSDF_PRINCIPLED_ID) #define CLOSURE_WEIGHT_CUTOFF 1e-5f diff --git a/intern/cycles/kernel/svm/svm_voxel.h b/intern/cycles/kernel/svm/svm_voxel.h index 9e826c8c23f..f4a5b2b2994 100644 --- a/intern/cycles/kernel/svm/svm_voxel.h +++ b/intern/cycles/kernel/svm/svm_voxel.h @@ -46,8 +46,13 @@ ccl_device void svm_node_tex_voxel(KernelGlobals *kg, # if defined(__KERNEL_CUDA__) # if __CUDA_ARCH__ >= 300 CUtexObject tex = kernel_tex_fetch(__bindless_mapping, id); - if(id < TEX_START_HALF4_CUDA_KEPLER) + const int texture_type = kernel_tex_type(id); + if(texture_type == IMAGE_DATA_TYPE_FLOAT4 || + texture_type == IMAGE_DATA_TYPE_BYTE4 || + texture_type == IMAGE_DATA_TYPE_HALF4) + { r = kernel_tex_image_interp_3d_float4(tex, co.x, co.y, co.z); + } else { float f = kernel_tex_image_interp_3d_float(tex, co.x, co.y, co.z); r = make_float4(f, f, f, 1.0f); |