diff options
Diffstat (limited to 'intern/cycles/kernel')
211 files changed, 16247 insertions, 9172 deletions
diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt index 5322f6abee1..7aab5f4a94a 100644 --- a/intern/cycles/kernel/CMakeLists.txt +++ b/intern/cycles/kernel/CMakeLists.txt @@ -1,31 +1,64 @@ remove_extra_strict_flags() set(INC - . - ../util - osl - svm + .. ) set(INC_SYS ) -set(SRC +set(SRC_CPU_KERNELS kernels/cpu/kernel.cpp + kernels/cpu/kernel_sse2.cpp + kernels/cpu/kernel_sse3.cpp + kernels/cpu/kernel_sse41.cpp + kernels/cpu/kernel_avx.cpp + kernels/cpu/kernel_avx2.cpp + kernels/cpu/kernel_split.cpp + kernels/cpu/kernel_split_sse2.cpp + kernels/cpu/kernel_split_sse3.cpp + kernels/cpu/kernel_split_sse41.cpp + kernels/cpu/kernel_split_avx.cpp + kernels/cpu/kernel_split_avx2.cpp + kernels/cpu/filter.cpp + kernels/cpu/filter_sse2.cpp + kernels/cpu/filter_sse3.cpp + kernels/cpu/filter_sse41.cpp + kernels/cpu/filter_avx.cpp + kernels/cpu/filter_avx2.cpp +) + +set(SRC_CUDA_KERNELS + kernels/cuda/kernel.cu + kernels/cuda/kernel_split.cu + kernels/cuda/filter.cu +) + +set(SRC_OPENCL_KERNELS kernels/opencl/kernel.cl + kernels/opencl/kernel_state_buffer_size.cl + kernels/opencl/kernel_split.cl kernels/opencl/kernel_data_init.cl + kernels/opencl/kernel_path_init.cl kernels/opencl/kernel_queue_enqueue.cl kernels/opencl/kernel_scene_intersect.cl kernels/opencl/kernel_lamp_emission.cl - kernels/opencl/kernel_background_buffer_update.cl + kernels/opencl/kernel_do_volume.cl + kernels/opencl/kernel_indirect_background.cl + kernels/opencl/kernel_shader_setup.cl + kernels/opencl/kernel_shader_sort.cl kernels/opencl/kernel_shader_eval.cl kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl + kernels/opencl/kernel_subsurface_scatter.cl kernels/opencl/kernel_direct_lighting.cl - kernels/opencl/kernel_shadow_blocked.cl + kernels/opencl/kernel_shadow_blocked_ao.cl + kernels/opencl/kernel_shadow_blocked_dl.cl + kernels/opencl/kernel_enqueue_inactive.cl kernels/opencl/kernel_next_iteration_setup.cl - 
kernels/opencl/kernel_sum_all_radiance.cl - kernels/cuda/kernel.cu + kernels/opencl/kernel_indirect_subsurface.cl + kernels/opencl/kernel_buffer_update.cl + kernels/opencl/filter.cl ) set(SRC_BVH_HEADERS @@ -52,12 +85,10 @@ set(SRC_HEADERS kernel_compat_cpu.h kernel_compat_cuda.h kernel_compat_opencl.h - kernel_debug.h kernel_differential.h kernel_emission.h kernel_film.h kernel_globals.h - kernel_image_opencl.h kernel_jitter.h kernel_light.h kernel_math.h @@ -68,6 +99,7 @@ set(SRC_HEADERS kernel_path_common.h kernel_path_state.h kernel_path_surface.h + kernel_path_subsurface.h kernel_path_volume.h kernel_projection.h kernel_queues.h @@ -86,6 +118,18 @@ set(SRC_KERNELS_CPU_HEADERS kernels/cpu/kernel_cpu.h kernels/cpu/kernel_cpu_impl.h kernels/cpu/kernel_cpu_image.h + kernels/cpu/filter_cpu.h + kernels/cpu/filter_cpu_impl.h +) + +set(SRC_KERNELS_CUDA_HEADERS + kernels/cuda/kernel_config.h + kernels/cuda/kernel_cuda_image.h +) + +set(SRC_KERNELS_OPENCL_HEADERS + kernels/opencl/kernel_split_function.h + kernels/opencl/kernel_opencl_image.h ) set(SRC_CLOSURE_HEADERS @@ -109,6 +153,8 @@ set(SRC_CLOSURE_HEADERS closure/bssrdf.h closure/emissive.h closure/volume.h + closure/bsdf_principled_diffuse.h + closure/bsdf_principled_sheen.h ) set(SRC_SVM_HEADERS @@ -162,8 +208,11 @@ set(SRC_GEOM_HEADERS geom/geom.h geom/geom_attribute.h geom/geom_curve.h + geom/geom_curve_intersect.h geom/geom_motion_curve.h geom/geom_motion_triangle.h + geom/geom_motion_triangle_intersect.h + geom/geom_motion_triangle_shader.h geom/geom_object.h geom/geom_patch.h geom/geom_primitive.h @@ -173,31 +222,93 @@ set(SRC_GEOM_HEADERS geom/geom_volume.h ) +set(SRC_FILTER_HEADERS + filter/filter.h + filter/filter_defines.h + filter/filter_features.h + filter/filter_features_sse.h + filter/filter_kernel.h + filter/filter_nlm_cpu.h + filter/filter_nlm_gpu.h + filter/filter_prefilter.h + filter/filter_reconstruction.h + filter/filter_transform.h + filter/filter_transform_gpu.h + 
filter/filter_transform_sse.h +) + set(SRC_UTIL_HEADERS ../util/util_atomic.h ../util/util_color.h + ../util/util_defines.h ../util/util_half.h ../util/util_hash.h ../util/util_math.h ../util/util_math_fast.h + ../util/util_math_intersect.h + ../util/util_math_float2.h + ../util/util_math_float3.h + ../util/util_math_float4.h + ../util/util_math_int2.h + ../util/util_math_int3.h + ../util/util_math_int4.h + ../util/util_math_matrix.h ../util/util_static_assert.h ../util/util_transform.h ../util/util_texture.h ../util/util_types.h + ../util/util_types_float2.h + ../util/util_types_float2_impl.h + ../util/util_types_float3.h + ../util/util_types_float3_impl.h + ../util/util_types_float4.h + ../util/util_types_float4_impl.h + ../util/util_types_int2.h + ../util/util_types_int2_impl.h + ../util/util_types_int3.h + ../util/util_types_int3_impl.h + ../util/util_types_int4.h + ../util/util_types_int4_impl.h + ../util/util_types_uchar2.h + ../util/util_types_uchar2_impl.h + ../util/util_types_uchar3.h + ../util/util_types_uchar3_impl.h + ../util/util_types_uchar4.h + ../util/util_types_uchar4_impl.h + ../util/util_types_uint2.h + ../util/util_types_uint2_impl.h + ../util/util_types_uint3.h + ../util/util_types_uint3_impl.h + ../util/util_types_uint4.h + ../util/util_types_uint4_impl.h + ../util/util_types_vector3.h + ../util/util_types_vector3_impl.h ) set(SRC_SPLIT_HEADERS - split/kernel_background_buffer_update.h + split/kernel_branched.h + split/kernel_buffer_update.h split/kernel_data_init.h split/kernel_direct_lighting.h + split/kernel_do_volume.h + split/kernel_enqueue_inactive.h split/kernel_holdout_emission_blurring_pathtermination_ao.h + split/kernel_indirect_background.h + split/kernel_indirect_subsurface.h split/kernel_lamp_emission.h split/kernel_next_iteration_setup.h + split/kernel_path_init.h + split/kernel_queue_enqueue.h split/kernel_scene_intersect.h + split/kernel_shader_setup.h + split/kernel_shader_sort.h split/kernel_shader_eval.h - 
split/kernel_shadow_blocked.h + split/kernel_shadow_blocked_ao.h + split/kernel_shadow_blocked_dl.h split/kernel_split_common.h - split/kernel_sum_all_radiance.h + split/kernel_split_data.h + split/kernel_split_data_types.h + split/kernel_subsurface_scatter.h ) # CUDA module @@ -217,7 +328,7 @@ if(WITH_CYCLES_CUDA_BINARIES) set(CUDA_VERSION "${CUDA_VERSION_MAJOR}${CUDA_VERSION_MINOR}") # warn for other versions - if(CUDA_VERSION MATCHES "80") + if(CUDA_VERSION MATCHES "80" OR CUDA_VERSION MATCHES "90") else() message(WARNING "CUDA version ${CUDA_VERSION_MAJOR}.${CUDA_VERSION_MINOR} detected, " @@ -225,25 +336,31 @@ if(WITH_CYCLES_CUDA_BINARIES) endif() # build for each arch - set(cuda_sources kernels/cuda/kernel.cu + set(cuda_sources kernels/cuda/kernel.cu kernels/cuda/kernel_split.cu ${SRC_HEADERS} + ${SRC_KERNELS_CUDA_HEADERS} ${SRC_BVH_HEADERS} ${SRC_SVM_HEADERS} ${SRC_GEOM_HEADERS} ${SRC_CLOSURE_HEADERS} ${SRC_UTIL_HEADERS} ) + set(cuda_filter_sources kernels/cuda/filter.cu + ${SRC_HEADERS} + ${SRC_KERNELS_CUDA_HEADERS} + ${SRC_FILTER_HEADERS} + ${SRC_UTIL_HEADERS} + ) set(cuda_cubins) - macro(CYCLES_CUDA_KERNEL_ADD arch experimental) + macro(CYCLES_CUDA_KERNEL_ADD arch name flags sources experimental) if(${experimental}) - set(cuda_extra_flags "-D__KERNEL_EXPERIMENTAL__") - set(cuda_cubin kernel_experimental_${arch}.cubin) - else() - set(cuda_extra_flags "") - set(cuda_cubin kernel_${arch}.cubin) + set(flags ${flags} -D__KERNEL_EXPERIMENTAL__) + set(name ${name}_experimental) endif() + set(cuda_cubin ${name}_${arch}.cubin) + if(WITH_CYCLES_DEBUG) set(cuda_debug_flags "-D__KERNEL_DEBUG__") else() @@ -256,26 +373,27 @@ if(WITH_CYCLES_CUDA_BINARIES) set(cuda_version_flags "-D__KERNEL_CUDA_VERSION__=${cuda_nvcc_version}") set(cuda_math_flags "--use_fast_math") + set(cuda_kernel_src "/kernels/cuda/${name}.cu") + add_custom_command( OUTPUT ${cuda_cubin} COMMAND ${cuda_nvcc_command} -arch=${arch} ${CUDA_NVCC_FLAGS} -m${CUDA_BITS} - --cubin 
${CMAKE_CURRENT_SOURCE_DIR}/kernels/cuda/kernel.cu + --cubin ${CMAKE_CURRENT_SOURCE_DIR}${cuda_kernel_src} -o ${CMAKE_CURRENT_BINARY_DIR}/${cuda_cubin} --ptxas-options="-v" ${cuda_arch_flags} ${cuda_version_flags} ${cuda_math_flags} - ${cuda_extra_flags} + ${flags} ${cuda_debug_flags} - -I${CMAKE_CURRENT_SOURCE_DIR}/../util - -I${CMAKE_CURRENT_SOURCE_DIR}/svm + -I${CMAKE_CURRENT_SOURCE_DIR}/.. -DCCL_NAMESPACE_BEGIN= -DCCL_NAMESPACE_END= -DNVCC - DEPENDS ${cuda_sources}) + DEPENDS ${sources}) delayed_install("${CMAKE_CURRENT_BINARY_DIR}" "${cuda_cubin}" ${CYCLES_INSTALL_PATH}/lib) list(APPEND cuda_cubins ${cuda_cubin}) @@ -288,8 +406,18 @@ if(WITH_CYCLES_CUDA_BINARIES) endmacro() foreach(arch ${CYCLES_CUDA_BINARIES_ARCH}) - # Compile regular kernel - CYCLES_CUDA_KERNEL_ADD(${arch} FALSE) + if(CUDA_VERSION MATCHES "90" AND ${arch} MATCHES "sm_2.") + message(STATUS "CUDA binaries for ${arch} disabled, not supported by CUDA 9.") + else() + # Compile regular kernel + CYCLES_CUDA_KERNEL_ADD(${arch} kernel "" "${cuda_sources}" FALSE) + CYCLES_CUDA_KERNEL_ADD(${arch} filter "" "${cuda_filter_sources}" FALSE) + + if(WITH_CYCLES_CUDA_SPLIT_KERNEL_BINARIES) + # Compile split kernel + CYCLES_CUDA_KERNEL_ADD(${arch} kernel_split "-D__SPLIT__" ${cuda_sources} FALSE) + endif() + endif() endforeach() add_custom_target(cycles_kernel_cuda ALL DEPENDS ${cuda_cubins}) @@ -319,38 +447,45 @@ list(APPEND SRC_HEADERS include_directories(${INC}) include_directories(SYSTEM ${INC_SYS}) -if(CXX_HAS_SSE) - list(APPEND SRC - kernels/cpu/kernel_sse2.cpp - kernels/cpu/kernel_sse3.cpp - kernels/cpu/kernel_sse41.cpp - ) +set_source_files_properties(kernels/cpu/kernel.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_KERNEL_FLAGS}") +set_source_files_properties(kernels/cpu/kernel_split.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_KERNEL_FLAGS}") +set_source_files_properties(kernels/cpu/filter.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_KERNEL_FLAGS}") +if(CXX_HAS_SSE) 
set_source_files_properties(kernels/cpu/kernel_sse2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS}") set_source_files_properties(kernels/cpu/kernel_sse3.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE3_KERNEL_FLAGS}") set_source_files_properties(kernels/cpu/kernel_sse41.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE41_KERNEL_FLAGS}") + set_source_files_properties(kernels/cpu/kernel_split_sse2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS}") + set_source_files_properties(kernels/cpu/kernel_split_sse3.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE3_KERNEL_FLAGS}") + set_source_files_properties(kernels/cpu/kernel_split_sse41.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE41_KERNEL_FLAGS}") + set_source_files_properties(kernels/cpu/filter_sse2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS}") + set_source_files_properties(kernels/cpu/filter_sse3.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE3_KERNEL_FLAGS}") + set_source_files_properties(kernels/cpu/filter_sse41.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE41_KERNEL_FLAGS}") endif() if(CXX_HAS_AVX) - list(APPEND SRC - kernels/cpu/kernel_avx.cpp - ) set_source_files_properties(kernels/cpu/kernel_avx.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX_KERNEL_FLAGS}") + set_source_files_properties(kernels/cpu/kernel_split_avx.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX_KERNEL_FLAGS}") + set_source_files_properties(kernels/cpu/filter_avx.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX_KERNEL_FLAGS}") endif() if(CXX_HAS_AVX2) - list(APPEND SRC - kernels/cpu/kernel_avx2.cpp - ) set_source_files_properties(kernels/cpu/kernel_avx2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX2_KERNEL_FLAGS}") + set_source_files_properties(kernels/cpu/kernel_split_avx2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX2_KERNEL_FLAGS}") + set_source_files_properties(kernels/cpu/filter_avx2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX2_KERNEL_FLAGS}") endif() add_library(cycles_kernel - ${SRC} + ${SRC_CPU_KERNELS} + ${SRC_CUDA_KERNELS} + ${SRC_OPENCL_KERNELS} 
${SRC_HEADERS} ${SRC_KERNELS_CPU_HEADERS} + ${SRC_KERNELS_CUDA_HEADERS} + ${SRC_KERNELS_OPENCL_HEADERS} ${SRC_BVH_HEADERS} ${SRC_CLOSURE_HEADERS} + ${SRC_FILTER_HEADERS} ${SRC_SVM_HEADERS} ${SRC_GEOM_HEADERS} ${SRC_SPLIT_HEADERS} @@ -370,24 +505,16 @@ endif() #add_custom_target(cycles_kernel_preprocess ALL DEPENDS ${KERNEL_PREPROCESSED}) #delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${KERNEL_PREPROCESSED}" ${CYCLES_INSTALL_PATH}/kernel) -delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl) -delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_data_init.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl) -delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_queue_enqueue.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl) -delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_scene_intersect.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl) -delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_lamp_emission.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl) -delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_background_buffer_update.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl) -delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_shader_eval.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl) -delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl) -delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_direct_lighting.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl) -delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_shadow_blocked.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl) -delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_next_iteration_setup.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl) -delayed_install(${CMAKE_CURRENT_SOURCE_DIR} 
"kernels/opencl/kernel_sum_all_radiance.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl) -delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/cuda/kernel.cu" ${CYCLES_INSTALL_PATH}/kernel/kernels/cuda) -delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel) -delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_BVH_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel/bvh) -delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_CLOSURE_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel/closure) -delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_SVM_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel/svm) -delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_GEOM_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel/geom) -delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_UTIL_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel) -delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_SPLIT_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel/split) +delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_OPENCL_KERNELS}" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl) +delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_CUDA_KERNELS}" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/cuda) +delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel) +delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNELS_OPENCL_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl) +delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNELS_CUDA_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/cuda) +delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_BVH_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/bvh) +delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_CLOSURE_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/closure) +delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_FILTER_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/filter) +delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_SVM_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/svm) 
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_GEOM_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/geom) +delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_UTIL_HEADERS}" ${CYCLES_INSTALL_PATH}/source/util) +delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_SPLIT_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/split) diff --git a/intern/cycles/kernel/bvh/bvh.h b/intern/cycles/kernel/bvh/bvh.h index 36798982653..cf0c8542d69 100644 --- a/intern/cycles/kernel/bvh/bvh.h +++ b/intern/cycles/kernel/bvh/bvh.h @@ -27,43 +27,43 @@ CCL_NAMESPACE_BEGIN -#include "bvh_types.h" +#include "kernel/bvh/bvh_types.h" /* Common QBVH functions. */ #ifdef __QBVH__ -# include "qbvh_nodes.h" +# include "kernel/bvh/qbvh_nodes.h" #endif /* Regular BVH traversal */ -#include "bvh_nodes.h" +#include "kernel/bvh/bvh_nodes.h" #define BVH_FUNCTION_NAME bvh_intersect #define BVH_FUNCTION_FEATURES 0 -#include "bvh_traversal.h" +#include "kernel/bvh/bvh_traversal.h" #if defined(__INSTANCING__) # define BVH_FUNCTION_NAME bvh_intersect_instancing # define BVH_FUNCTION_FEATURES BVH_INSTANCING -# include "bvh_traversal.h" +# include "kernel/bvh/bvh_traversal.h" #endif #if defined(__HAIR__) # define BVH_FUNCTION_NAME bvh_intersect_hair # define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR|BVH_HAIR_MINIMUM_WIDTH -# include "bvh_traversal.h" +# include "kernel/bvh/bvh_traversal.h" #endif #if defined(__OBJECT_MOTION__) # define BVH_FUNCTION_NAME bvh_intersect_motion # define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_MOTION -# include "bvh_traversal.h" +# include "kernel/bvh/bvh_traversal.h" #endif #if defined(__HAIR__) && defined(__OBJECT_MOTION__) # define BVH_FUNCTION_NAME bvh_intersect_hair_motion # define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR|BVH_HAIR_MINIMUM_WIDTH|BVH_MOTION -# include "bvh_traversal.h" +# include "kernel/bvh/bvh_traversal.h" #endif /* Subsurface scattering BVH traversal */ @@ -71,12 +71,12 @@ CCL_NAMESPACE_BEGIN #if defined(__SUBSURFACE__) # define BVH_FUNCTION_NAME 
bvh_intersect_subsurface # define BVH_FUNCTION_FEATURES BVH_HAIR -# include "bvh_subsurface.h" +# include "kernel/bvh/bvh_subsurface.h" # if defined(__OBJECT_MOTION__) # define BVH_FUNCTION_NAME bvh_intersect_subsurface_motion # define BVH_FUNCTION_FEATURES BVH_MOTION|BVH_HAIR -# include "bvh_subsurface.h" +# include "kernel/bvh/bvh_subsurface.h" # endif #endif /* __SUBSURFACE__ */ @@ -85,18 +85,18 @@ CCL_NAMESPACE_BEGIN #if defined(__VOLUME__) # define BVH_FUNCTION_NAME bvh_intersect_volume # define BVH_FUNCTION_FEATURES BVH_HAIR -# include "bvh_volume.h" +# include "kernel/bvh/bvh_volume.h" # if defined(__INSTANCING__) # define BVH_FUNCTION_NAME bvh_intersect_volume_instancing # define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR -# include "bvh_volume.h" +# include "kernel/bvh/bvh_volume.h" # endif # if defined(__OBJECT_MOTION__) # define BVH_FUNCTION_NAME bvh_intersect_volume_motion # define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_MOTION|BVH_HAIR -# include "bvh_volume.h" +# include "kernel/bvh/bvh_volume.h" # endif #endif /* __VOLUME__ */ @@ -105,30 +105,30 @@ CCL_NAMESPACE_BEGIN #if defined(__SHADOW_RECORD_ALL__) # define BVH_FUNCTION_NAME bvh_intersect_shadow_all # define BVH_FUNCTION_FEATURES 0 -# include "bvh_shadow_all.h" +# include "kernel/bvh/bvh_shadow_all.h" # if defined(__INSTANCING__) # define BVH_FUNCTION_NAME bvh_intersect_shadow_all_instancing # define BVH_FUNCTION_FEATURES BVH_INSTANCING -# include "bvh_shadow_all.h" +# include "kernel/bvh/bvh_shadow_all.h" # endif # if defined(__HAIR__) # define BVH_FUNCTION_NAME bvh_intersect_shadow_all_hair # define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR -# include "bvh_shadow_all.h" +# include "kernel/bvh/bvh_shadow_all.h" # endif # if defined(__OBJECT_MOTION__) # define BVH_FUNCTION_NAME bvh_intersect_shadow_all_motion # define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_MOTION -# include "bvh_shadow_all.h" +# include "kernel/bvh/bvh_shadow_all.h" # endif # if defined(__HAIR__) && 
defined(__OBJECT_MOTION__) # define BVH_FUNCTION_NAME bvh_intersect_shadow_all_hair_motion # define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR|BVH_MOTION -# include "bvh_shadow_all.h" +# include "kernel/bvh/bvh_shadow_all.h" # endif #endif /* __SHADOW_RECORD_ALL__ */ @@ -137,18 +137,18 @@ CCL_NAMESPACE_BEGIN #if defined(__VOLUME_RECORD_ALL__) # define BVH_FUNCTION_NAME bvh_intersect_volume_all # define BVH_FUNCTION_FEATURES BVH_HAIR -# include "bvh_volume_all.h" +# include "kernel/bvh/bvh_volume_all.h" # if defined(__INSTANCING__) # define BVH_FUNCTION_NAME bvh_intersect_volume_all_instancing # define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR -# include "bvh_volume_all.h" +# include "kernel/bvh/bvh_volume_all.h" # endif # if defined(__OBJECT_MOTION__) # define BVH_FUNCTION_NAME bvh_intersect_volume_all_motion # define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_MOTION|BVH_HAIR -# include "bvh_volume_all.h" +# include "kernel/bvh/bvh_volume_all.h" # endif #endif /* __VOLUME_RECORD_ALL__ */ @@ -202,8 +202,9 @@ ccl_device_intersect bool scene_intersect(KernelGlobals *kg, } #ifdef __SUBSURFACE__ +/* Note: ray is passed by value to work around a possible CUDA compiler bug. 
*/ ccl_device_intersect void scene_intersect_subsurface(KernelGlobals *kg, - const Ray *ray, + const Ray ray, SubsurfaceIntersection *ss_isect, int subsurface_object, uint *lcg_state, @@ -212,7 +213,7 @@ ccl_device_intersect void scene_intersect_subsurface(KernelGlobals *kg, #ifdef __OBJECT_MOTION__ if(kernel_data.bvh.have_motion) { return bvh_intersect_subsurface_motion(kg, - ray, + &ray, ss_isect, subsurface_object, lcg_state, @@ -220,7 +221,7 @@ ccl_device_intersect void scene_intersect_subsurface(KernelGlobals *kg, } #endif /* __OBJECT_MOTION__ */ return bvh_intersect_subsurface(kg, - ray, + &ray, ss_isect, subsurface_object, lcg_state, @@ -229,30 +230,63 @@ ccl_device_intersect void scene_intersect_subsurface(KernelGlobals *kg, #endif #ifdef __SHADOW_RECORD_ALL__ -ccl_device_intersect bool scene_intersect_shadow_all(KernelGlobals *kg, const Ray *ray, Intersection *isect, uint max_hits, uint *num_hits) +ccl_device_intersect bool scene_intersect_shadow_all(KernelGlobals *kg, + const Ray *ray, + Intersection *isect, + uint visibility, + uint max_hits, + uint *num_hits) { # ifdef __OBJECT_MOTION__ if(kernel_data.bvh.have_motion) { # ifdef __HAIR__ - if(kernel_data.bvh.have_curves) - return bvh_intersect_shadow_all_hair_motion(kg, ray, isect, max_hits, num_hits); + if(kernel_data.bvh.have_curves) { + return bvh_intersect_shadow_all_hair_motion(kg, + ray, + isect, + visibility, + max_hits, + num_hits); + } # endif /* __HAIR__ */ - return bvh_intersect_shadow_all_motion(kg, ray, isect, max_hits, num_hits); + return bvh_intersect_shadow_all_motion(kg, + ray, + isect, + visibility, + max_hits, + num_hits); } # endif /* __OBJECT_MOTION__ */ # ifdef __HAIR__ - if(kernel_data.bvh.have_curves) - return bvh_intersect_shadow_all_hair(kg, ray, isect, max_hits, num_hits); + if(kernel_data.bvh.have_curves) { + return bvh_intersect_shadow_all_hair(kg, + ray, + isect, + visibility, + max_hits, + num_hits); + } # endif /* __HAIR__ */ # ifdef __INSTANCING__ - 
if(kernel_data.bvh.have_instancing) - return bvh_intersect_shadow_all_instancing(kg, ray, isect, max_hits, num_hits); + if(kernel_data.bvh.have_instancing) { + return bvh_intersect_shadow_all_instancing(kg, + ray, + isect, + visibility, + max_hits, + num_hits); + } # endif /* __INSTANCING__ */ - return bvh_intersect_shadow_all(kg, ray, isect, max_hits, num_hits); + return bvh_intersect_shadow_all(kg, + ray, + isect, + visibility, + max_hits, + num_hits); } #endif /* __SHADOW_RECORD_ALL__ */ @@ -357,7 +391,7 @@ ccl_device_inline float3 ray_offset(float3 P, float3 Ng) #endif } -#if defined(__SHADOW_RECORD_ALL__) || defined (__VOLUME_RECORD_ALL__) +#if defined(__VOLUME_RECORD_ALL__) || (defined(__SHADOW_RECORD_ALL__) && defined(__KERNEL_CPU__)) /* ToDo: Move to another file? */ ccl_device int intersections_compare(const void *a, const void *b) { @@ -373,5 +407,28 @@ ccl_device int intersections_compare(const void *a, const void *b) } #endif -CCL_NAMESPACE_END +#if defined(__SHADOW_RECORD_ALL__) +ccl_device_inline void sort_intersections(Intersection *hits, uint num_hits) +{ +#ifdef __KERNEL_GPU__ + /* Use bubble sort which has more friendly memory pattern on GPU. */ + bool swapped; + do { + swapped = false; + for(int j = 0; j < num_hits - 1; ++j) { + if(hits[j].t > hits[j + 1].t) { + struct Intersection tmp = hits[j]; + hits[j] = hits[j + 1]; + hits[j + 1] = tmp; + swapped = true; + } + } + --num_hits; + } while(swapped); +#else + qsort(hits, num_hits, sizeof(Intersection), intersections_compare); +#endif +} +#endif /* __SHADOW_RECORD_ALL__ | __VOLUME_RECORD_ALL__ */ +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/bvh/bvh_nodes.h b/intern/cycles/kernel/bvh/bvh_nodes.h index 726bef1794c..6c33dad5426 100644 --- a/intern/cycles/kernel/bvh/bvh_nodes.h +++ b/intern/cycles/kernel/bvh/bvh_nodes.h @@ -17,8 +17,8 @@ // TODO(sergey): Look into avoid use of full Transform and use 3x3 matrix and // 3-vector which might be faster. 
ccl_device_forceinline Transform bvh_unaligned_node_fetch_space(KernelGlobals *kg, - int node_addr, - int child) + int node_addr, + int child) { Transform space; const int child_addr = node_addr + child * 3; @@ -31,12 +31,12 @@ ccl_device_forceinline Transform bvh_unaligned_node_fetch_space(KernelGlobals *k #if !defined(__KERNEL_SSE2__) ccl_device_forceinline int bvh_aligned_node_intersect(KernelGlobals *kg, - const float3 P, - const float3 idir, - const float t, - const int node_addr, - const uint visibility, - float dist[2]) + const float3 P, + const float3 idir, + const float t, + const int node_addr, + const uint visibility, + float dist[2]) { /* fetch node data */ @@ -52,8 +52,8 @@ ccl_device_forceinline int bvh_aligned_node_intersect(KernelGlobals *kg, float c0hiy = (node1.z - P.y) * idir.y; float c0loz = (node2.x - P.z) * idir.z; float c0hiz = (node2.z - P.z) * idir.z; - float c0min = max4(min(c0lox, c0hix), min(c0loy, c0hiy), min(c0loz, c0hiz), 0.0f); - float c0max = min4(max(c0lox, c0hix), max(c0loy, c0hiy), max(c0loz, c0hiz), t); + float c0min = max4(0.0f, min(c0lox, c0hix), min(c0loy, c0hiy), min(c0loz, c0hiz)); + float c0max = min4(t, max(c0lox, c0hix), max(c0loy, c0hiy), max(c0loz, c0hiz)); float c1lox = (node0.y - P.x) * idir.x; float c1hix = (node0.w - P.x) * idir.x; @@ -61,8 +61,8 @@ ccl_device_forceinline int bvh_aligned_node_intersect(KernelGlobals *kg, float c1hiy = (node1.w - P.y) * idir.y; float c1loz = (node2.y - P.z) * idir.z; float c1hiz = (node2.w - P.z) * idir.z; - float c1min = max4(min(c1lox, c1hix), min(c1loy, c1hiy), min(c1loz, c1hiz), 0.0f); - float c1max = min4(max(c1lox, c1hix), max(c1loy, c1hiy), max(c1loz, c1hiz), t); + float c1min = max4(0.0f, min(c1lox, c1hix), min(c1loy, c1hiy), min(c1loz, c1hiz)); + float c1max = min4(t, max(c1lox, c1hix), max(c1loy, c1hiy), max(c1loz, c1hiz)); dist[0] = c0min; dist[1] = c1min; @@ -78,14 +78,14 @@ ccl_device_forceinline int bvh_aligned_node_intersect(KernelGlobals *kg, } ccl_device_forceinline 
int bvh_aligned_node_intersect_robust(KernelGlobals *kg, - const float3 P, - const float3 idir, - const float t, - const float difl, - const float extmax, - const int node_addr, - const uint visibility, - float dist[2]) + const float3 P, + const float3 idir, + const float t, + const float difl, + const float extmax, + const int node_addr, + const uint visibility, + float dist[2]) { /* fetch node data */ @@ -101,8 +101,8 @@ ccl_device_forceinline int bvh_aligned_node_intersect_robust(KernelGlobals *kg, float c0hiy = (node1.z - P.y) * idir.y; float c0loz = (node2.x - P.z) * idir.z; float c0hiz = (node2.z - P.z) * idir.z; - float c0min = max4(min(c0lox, c0hix), min(c0loy, c0hiy), min(c0loz, c0hiz), 0.0f); - float c0max = min4(max(c0lox, c0hix), max(c0loy, c0hiy), max(c0loz, c0hiz), t); + float c0min = max4(0.0f, min(c0lox, c0hix), min(c0loy, c0hiy), min(c0loz, c0hiz)); + float c0max = min4(t, max(c0lox, c0hix), max(c0loy, c0hiy), max(c0loz, c0hiz)); float c1lox = (node0.y - P.x) * idir.x; float c1hix = (node0.w - P.x) * idir.x; @@ -110,8 +110,8 @@ ccl_device_forceinline int bvh_aligned_node_intersect_robust(KernelGlobals *kg, float c1hiy = (node1.w - P.y) * idir.y; float c1loz = (node2.y - P.z) * idir.z; float c1hiz = (node2.w - P.z) * idir.z; - float c1min = max4(min(c1lox, c1hix), min(c1loy, c1hiy), min(c1loz, c1hiz), 0.0f); - float c1max = min4(max(c1lox, c1hix), max(c1loy, c1hiy), max(c1loz, c1hiz), t); + float c1min = max4(0.0f, min(c1lox, c1hix), min(c1loy, c1hiy), min(c1loz, c1hiz)); + float c1max = min4(t, max(c1lox, c1hix), max(c1loy, c1hiy), max(c1loz, c1hiz)); if(difl != 0.0f) { float hdiff = 1.0f + difl; @@ -203,13 +203,13 @@ ccl_device_forceinline bool bvh_unaligned_node_intersect_child_robust( } ccl_device_forceinline int bvh_unaligned_node_intersect(KernelGlobals *kg, - const float3 P, - const float3 dir, - const float3 idir, - const float t, - const int node_addr, - const uint visibility, - float dist[2]) + const float3 P, + const float3 dir, + const 
float3 idir, + const float t, + const int node_addr, + const uint visibility, + float dist[2]) { int mask = 0; float4 cnodes = kernel_tex_fetch(__bvh_nodes, node_addr+0); @@ -233,15 +233,15 @@ ccl_device_forceinline int bvh_unaligned_node_intersect(KernelGlobals *kg, } ccl_device_forceinline int bvh_unaligned_node_intersect_robust(KernelGlobals *kg, - const float3 P, - const float3 dir, - const float3 idir, - const float t, - const float difl, - const float extmax, - const int node_addr, - const uint visibility, - float dist[2]) + const float3 P, + const float3 dir, + const float3 idir, + const float t, + const float difl, + const float extmax, + const int node_addr, + const uint visibility, + float dist[2]) { int mask = 0; float4 cnodes = kernel_tex_fetch(__bvh_nodes, node_addr+0); @@ -265,13 +265,13 @@ ccl_device_forceinline int bvh_unaligned_node_intersect_robust(KernelGlobals *kg } ccl_device_forceinline int bvh_node_intersect(KernelGlobals *kg, - const float3 P, - const float3 dir, - const float3 idir, - const float t, - const int node_addr, - const uint visibility, - float dist[2]) + const float3 P, + const float3 dir, + const float3 idir, + const float t, + const int node_addr, + const uint visibility, + float dist[2]) { float4 node = kernel_tex_fetch(__bvh_nodes, node_addr); if(__float_as_uint(node.x) & PATH_RAY_NODE_UNALIGNED) { @@ -296,15 +296,15 @@ ccl_device_forceinline int bvh_node_intersect(KernelGlobals *kg, } ccl_device_forceinline int bvh_node_intersect_robust(KernelGlobals *kg, - const float3 P, - const float3 dir, - const float3 idir, - const float t, - const float difl, - const float extmax, - const int node_addr, - const uint visibility, - float dist[2]) + const float3 P, + const float3 dir, + const float3 idir, + const float t, + const float difl, + const float extmax, + const int node_addr, + const uint visibility, + float dist[2]) { float4 node = kernel_tex_fetch(__bvh_nodes, node_addr); if(__float_as_uint(node.x) & PATH_RAY_NODE_UNALIGNED) 
{ @@ -442,19 +442,19 @@ ccl_device_forceinline int bvh_aligned_node_intersect_robust( } ccl_device_forceinline int bvh_unaligned_node_intersect(KernelGlobals *kg, - const float3 P, - const float3 dir, - const ssef& isect_near, - const ssef& isect_far, - const int node_addr, - const uint visibility, - float dist[2]) + const float3 P, + const float3 dir, + const ssef& isect_near, + const ssef& isect_far, + const int node_addr, + const uint visibility, + float dist[2]) { Transform space0 = bvh_unaligned_node_fetch_space(kg, node_addr, 0); Transform space1 = bvh_unaligned_node_fetch_space(kg, node_addr, 1); float3 aligned_dir0 = transform_direction(&space0, dir), - aligned_dir1 = transform_direction(&space1, dir);; + aligned_dir1 = transform_direction(&space1, dir); float3 aligned_P0 = transform_point(&space0, P), aligned_P1 = transform_point(&space1, P); float3 nrdir0 = -bvh_inverse_direction(aligned_dir0), @@ -483,8 +483,8 @@ ccl_device_forceinline int bvh_unaligned_node_intersect(KernelGlobals *kg, ssef tfar_y = max(lower_y, upper_y); ssef tfar_z = max(lower_z, upper_z); - const ssef tnear = max4(tnear_x, tnear_y, tnear_z, isect_near); - const ssef tfar = min4(tfar_x, tfar_y, tfar_z, isect_far); + const ssef tnear = max4(isect_near, tnear_x, tnear_y, tnear_z); + const ssef tfar = min4(isect_far, tfar_x, tfar_y, tfar_z); sseb vmask = tnear <= tfar; dist[0] = tnear.f[0]; dist[1] = tnear.f[1]; @@ -503,20 +503,20 @@ ccl_device_forceinline int bvh_unaligned_node_intersect(KernelGlobals *kg, } ccl_device_forceinline int bvh_unaligned_node_intersect_robust(KernelGlobals *kg, - const float3 P, - const float3 dir, - const ssef& isect_near, - const ssef& isect_far, - const float difl, - const int node_addr, - const uint visibility, - float dist[2]) + const float3 P, + const float3 dir, + const ssef& isect_near, + const ssef& isect_far, + const float difl, + const int node_addr, + const uint visibility, + float dist[2]) { Transform space0 = bvh_unaligned_node_fetch_space(kg, 
node_addr, 0); Transform space1 = bvh_unaligned_node_fetch_space(kg, node_addr, 1); float3 aligned_dir0 = transform_direction(&space0, dir), - aligned_dir1 = transform_direction(&space1, dir);; + aligned_dir1 = transform_direction(&space1, dir); float3 aligned_P0 = transform_point(&space0, P), aligned_P1 = transform_point(&space1, P); float3 nrdir0 = -bvh_inverse_direction(aligned_dir0), @@ -545,8 +545,8 @@ ccl_device_forceinline int bvh_unaligned_node_intersect_robust(KernelGlobals *kg ssef tfar_y = max(lower_y, upper_y); ssef tfar_z = max(lower_z, upper_z); - const ssef tnear = max4(tnear_x, tnear_y, tnear_z, isect_near); - const ssef tfar = min4(tfar_x, tfar_y, tfar_z, isect_far); + const ssef tnear = max4(isect_near, tnear_x, tnear_y, tnear_z); + const ssef tfar = min4(isect_far, tfar_x, tfar_y, tfar_z); sseb vmask; if(difl != 0.0f) { const float round_down = 1.0f - difl; @@ -574,17 +574,17 @@ ccl_device_forceinline int bvh_unaligned_node_intersect_robust(KernelGlobals *kg } ccl_device_forceinline int bvh_node_intersect(KernelGlobals *kg, - const float3& P, - const float3& dir, - const ssef& isect_near, - const ssef& isect_far, - const ssef& tsplat, - const ssef Psplat[3], - const ssef idirsplat[3], - const shuffle_swap_t shufflexyz[3], - const int node_addr, - const uint visibility, - float dist[2]) + const float3& P, + const float3& dir, + const ssef& isect_near, + const ssef& isect_far, + const ssef& tsplat, + const ssef Psplat[3], + const ssef idirsplat[3], + const shuffle_swap_t shufflexyz[3], + const int node_addr, + const uint visibility, + float dist[2]) { float4 node = kernel_tex_fetch(__bvh_nodes, node_addr); if(__float_as_uint(node.x) & PATH_RAY_NODE_UNALIGNED) { @@ -612,19 +612,19 @@ ccl_device_forceinline int bvh_node_intersect(KernelGlobals *kg, } ccl_device_forceinline int bvh_node_intersect_robust(KernelGlobals *kg, - const float3& P, - const float3& dir, - const ssef& isect_near, - const ssef& isect_far, - const ssef& tsplat, - const ssef 
Psplat[3], - const ssef idirsplat[3], - const shuffle_swap_t shufflexyz[3], - const float difl, - const float extmax, - const int node_addr, - const uint visibility, - float dist[2]) + const float3& P, + const float3& dir, + const ssef& isect_near, + const ssef& isect_far, + const ssef& tsplat, + const ssef Psplat[3], + const ssef idirsplat[3], + const shuffle_swap_t shufflexyz[3], + const float difl, + const float extmax, + const int node_addr, + const uint visibility, + float dist[2]) { float4 node = kernel_tex_fetch(__bvh_nodes, node_addr); if(__float_as_uint(node.x) & PATH_RAY_NODE_UNALIGNED) { diff --git a/intern/cycles/kernel/bvh/bvh_shadow_all.h b/intern/cycles/kernel/bvh/bvh_shadow_all.h index 294362ea995..a6a4353562c 100644 --- a/intern/cycles/kernel/bvh/bvh_shadow_all.h +++ b/intern/cycles/kernel/bvh/bvh_shadow_all.h @@ -18,7 +18,7 @@ */ #ifdef __QBVH__ -# include "qbvh_shadow_all.h" +# include "kernel/bvh/qbvh_shadow_all.h" #endif #if BVH_FEATURE(BVH_HAIR) @@ -45,6 +45,7 @@ ccl_device_inline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, const Ray *ray, Intersection *isect_array, + const uint visibility, const uint max_hits, uint *num_hits) { @@ -100,9 +101,6 @@ bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz); #endif /* __KERNEL_SSE2__ */ - IsectPrecalc isect_precalc; - triangle_intersect_precalc(dir, &isect_precalc); - /* traversal loop */ do { do { @@ -121,7 +119,7 @@ bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, idir, isect_t, node_addr, - PATH_RAY_SHADOW, + visibility, dist); #else // __KERNEL_SSE2__ traverse_mask = NODE_INTERSECT(kg, @@ -136,7 +134,7 @@ bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, idirsplat, shufflexyz, node_addr, - PATH_RAY_SHADOW, + visibility, dist); #endif // __KERNEL_SSE2__ @@ -187,8 +185,7 @@ bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, /* primitive intersection */ while(prim_addr < prim_addr2) { - 
kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); - + kernel_assert((kernel_tex_fetch(__prim_type, prim_addr) & PRIMITIVE_ALL) == p_type); bool hit; /* todo: specialized intersect functions which don't fill in @@ -198,10 +195,10 @@ bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, switch(p_type) { case PRIMITIVE_TRIANGLE: { hit = triangle_intersect(kg, - &isect_precalc, isect_array, P, - PATH_RAY_SHADOW, + dir, + visibility, object, prim_addr); break; @@ -213,7 +210,7 @@ bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, P, dir, ray->time, - PATH_RAY_SHADOW, + visibility, object, prim_addr); break; @@ -222,31 +219,32 @@ bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, #if BVH_FEATURE(BVH_HAIR) case PRIMITIVE_CURVE: case PRIMITIVE_MOTION_CURVE: { + const uint curve_type = kernel_tex_fetch(__prim_type, prim_addr); if(kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE) { - hit = bvh_cardinal_curve_intersect(kg, - isect_array, - P, - dir, - PATH_RAY_SHADOW, - object, - prim_addr, - ray->time, - type, - NULL, - 0, 0); + hit = cardinal_curve_intersect(kg, + isect_array, + P, + dir, + visibility, + object, + prim_addr, + ray->time, + curve_type, + NULL, + 0, 0); } else { - hit = bvh_curve_intersect(kg, - isect_array, - P, - dir, - PATH_RAY_SHADOW, - object, - prim_addr, - ray->time, - type, - NULL, - 0, 0); + hit = curve_intersect(kg, + isect_array, + P, + dir, + visibility, + object, + prim_addr, + ray->time, + curve_type, + NULL, + 0, 0); } break; } @@ -308,12 +306,11 @@ bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, object = kernel_tex_fetch(__prim_object, -prim_addr-1); # if BVH_FEATURE(BVH_MOTION) - bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &isect_t, &ob_itfm); + isect_t = bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, isect_t, &ob_itfm); # else - bvh_instance_push(kg, object, ray, &P, &dir, &idir, &isect_t); + isect_t = bvh_instance_push(kg, object, ray, &P, &dir, &idir, isect_t); # endif - 
triangle_intersect_precalc(dir, &isect_precalc); num_hits_in_instance = 0; isect_array->t = isect_t; @@ -353,22 +350,17 @@ bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, bvh_instance_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac); # endif - triangle_intersect_precalc(dir, &isect_precalc); - /* scale isect->t to adjust for instancing */ for(int i = 0; i < num_hits_in_instance; i++) { (isect_array-i-1)->t *= t_fac; } } else { - float ignore_t = FLT_MAX; - # if BVH_FEATURE(BVH_MOTION) - bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, &ignore_t, &ob_itfm); + bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX, &ob_itfm); # else - bvh_instance_pop(kg, object, ray, &P, &dir, &idir, &ignore_t); + bvh_instance_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX); # endif - triangle_intersect_precalc(dir, &isect_precalc); } isect_t = tmax; @@ -399,6 +391,7 @@ bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, ccl_device_inline bool BVH_FUNCTION_NAME(KernelGlobals *kg, const Ray *ray, Intersection *isect_array, + const uint visibility, const uint max_hits, uint *num_hits) { @@ -407,6 +400,7 @@ ccl_device_inline bool BVH_FUNCTION_NAME(KernelGlobals *kg, return BVH_FUNCTION_FULL_NAME(QBVH)(kg, ray, isect_array, + visibility, max_hits, num_hits); } @@ -417,6 +411,7 @@ ccl_device_inline bool BVH_FUNCTION_NAME(KernelGlobals *kg, return BVH_FUNCTION_FULL_NAME(BVH)(kg, ray, isect_array, + visibility, max_hits, num_hits); } diff --git a/intern/cycles/kernel/bvh/bvh_subsurface.h b/intern/cycles/kernel/bvh/bvh_subsurface.h index d9623c94b2e..bda7e34907a 100644 --- a/intern/cycles/kernel/bvh/bvh_subsurface.h +++ b/intern/cycles/kernel/bvh/bvh_subsurface.h @@ -18,7 +18,7 @@ */ #ifdef __QBVH__ -# include "qbvh_subsurface.h" +# include "kernel/bvh/qbvh_subsurface.h" #endif #if BVH_FEATURE(BVH_HAIR) @@ -72,19 +72,19 @@ void BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, ss_isect->num_hits = 0; const int object_flag = kernel_tex_fetch(__object_flag, 
subsurface_object); - if(!(object_flag & SD_TRANSFORM_APPLIED)) { + if(!(object_flag & SD_OBJECT_TRANSFORM_APPLIED)) { #if BVH_FEATURE(BVH_MOTION) Transform ob_itfm; - bvh_instance_motion_push(kg, - subsurface_object, - ray, - &P, - &dir, - &idir, - &isect_t, - &ob_itfm); + isect_t = bvh_instance_motion_push(kg, + subsurface_object, + ray, + &P, + &dir, + &idir, + isect_t, + &ob_itfm); #else - bvh_instance_push(kg, subsurface_object, ray, &P, &dir, &idir, &isect_t); + isect_t = bvh_instance_push(kg, subsurface_object, ray, &P, &dir, &idir, isect_t); #endif object = subsurface_object; } @@ -109,9 +109,6 @@ void BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz); #endif - IsectPrecalc isect_precalc; - triangle_intersect_precalc(dir, &isect_precalc); - /* traversal loop */ do { do { @@ -197,9 +194,9 @@ void BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, for(; prim_addr < prim_addr2; prim_addr++) { kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); triangle_intersect_subsurface(kg, - &isect_precalc, ss_isect, P, + dir, object, prim_addr, isect_t, diff --git a/intern/cycles/kernel/bvh/bvh_traversal.h b/intern/cycles/kernel/bvh/bvh_traversal.h index a0e478e972b..ae8f54821f2 100644 --- a/intern/cycles/kernel/bvh/bvh_traversal.h +++ b/intern/cycles/kernel/bvh/bvh_traversal.h @@ -18,7 +18,7 @@ */ #ifdef __QBVH__ -# include "qbvh_traversal.h" +# include "kernel/bvh/qbvh_traversal.h" #endif #if BVH_FEATURE(BVH_HAIR) @@ -104,9 +104,6 @@ ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz); #endif - IsectPrecalc isect_precalc; - triangle_intersect_precalc(dir, &isect_precalc); - /* traversal loop */ do { do { @@ -213,7 +210,7 @@ ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, --stack_ptr; } } - BVH_DEBUG_NEXT_STEP(); + BVH_DEBUG_NEXT_NODE(); } /* if node is leaf, fetch 
triangle list */ @@ -235,26 +232,26 @@ ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, switch(type & PRIMITIVE_ALL) { case PRIMITIVE_TRIANGLE: { for(; prim_addr < prim_addr2; prim_addr++) { - BVH_DEBUG_NEXT_STEP(); + BVH_DEBUG_NEXT_INTERSECTION(); kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); if(triangle_intersect(kg, - &isect_precalc, isect, P, + dir, visibility, object, prim_addr)) { /* shadow ray early termination */ #if defined(__KERNEL_SSE2__) - if(visibility == PATH_RAY_SHADOW_OPAQUE) + if(visibility & PATH_RAY_SHADOW_OPAQUE) return true; tsplat = ssef(0.0f, 0.0f, -isect->t, -isect->t); # if BVH_FEATURE(BVH_HAIR) tfar = ssef(isect->t); # endif #else - if(visibility == PATH_RAY_SHADOW_OPAQUE) + if(visibility & PATH_RAY_SHADOW_OPAQUE) return true; #endif } @@ -264,7 +261,7 @@ ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, #if BVH_FEATURE(BVH_MOTION) case PRIMITIVE_MOTION_TRIANGLE: { for(; prim_addr < prim_addr2; prim_addr++) { - BVH_DEBUG_NEXT_STEP(); + BVH_DEBUG_NEXT_INTERSECTION(); kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); if(motion_triangle_intersect(kg, isect, @@ -277,14 +274,14 @@ ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, { /* shadow ray early termination */ # if defined(__KERNEL_SSE2__) - if(visibility == PATH_RAY_SHADOW_OPAQUE) + if(visibility & PATH_RAY_SHADOW_OPAQUE) return true; tsplat = ssef(0.0f, 0.0f, -isect->t, -isect->t); # if BVH_FEATURE(BVH_HAIR) tfar = ssef(isect->t); # endif # else - if(visibility == PATH_RAY_SHADOW_OPAQUE) + if(visibility & PATH_RAY_SHADOW_OPAQUE) return true; # endif } @@ -296,48 +293,49 @@ ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, case PRIMITIVE_CURVE: case PRIMITIVE_MOTION_CURVE: { for(; prim_addr < prim_addr2; prim_addr++) { - BVH_DEBUG_NEXT_STEP(); - kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); + BVH_DEBUG_NEXT_INTERSECTION(); + const uint curve_type = 
kernel_tex_fetch(__prim_type, prim_addr); + kernel_assert((curve_type & PRIMITIVE_ALL) == (type & PRIMITIVE_ALL)); bool hit; if(kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE) { - hit = bvh_cardinal_curve_intersect(kg, - isect, - P, - dir, - visibility, - object, - prim_addr, - ray->time, - type, - lcg_state, - difl, - extmax); + hit = cardinal_curve_intersect(kg, + isect, + P, + dir, + visibility, + object, + prim_addr, + ray->time, + curve_type, + lcg_state, + difl, + extmax); } else { - hit = bvh_curve_intersect(kg, - isect, - P, - dir, - visibility, - object, - prim_addr, - ray->time, - type, - lcg_state, - difl, - extmax); + hit = curve_intersect(kg, + isect, + P, + dir, + visibility, + object, + prim_addr, + ray->time, + curve_type, + lcg_state, + difl, + extmax); } if(hit) { /* shadow ray early termination */ # if defined(__KERNEL_SSE2__) - if(visibility == PATH_RAY_SHADOW_OPAQUE) + if(visibility & PATH_RAY_SHADOW_OPAQUE) return true; tsplat = ssef(0.0f, 0.0f, -isect->t, -isect->t); # if BVH_FEATURE(BVH_HAIR) tfar = ssef(isect->t); # endif # else - if(visibility == PATH_RAY_SHADOW_OPAQUE) + if(visibility & PATH_RAY_SHADOW_OPAQUE) return true; # endif } @@ -353,11 +351,10 @@ ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, object = kernel_tex_fetch(__prim_object, -prim_addr-1); # if BVH_FEATURE(BVH_MOTION) - bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &isect->t, &ob_itfm); + isect->t = bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, isect->t, &ob_itfm); # else - bvh_instance_push(kg, object, ray, &P, &dir, &idir, &isect->t); + isect->t = bvh_instance_push(kg, object, ray, &P, &dir, &idir, isect->t); # endif - triangle_intersect_precalc(dir, &isect_precalc); # if defined(__KERNEL_SSE2__) Psplat[0] = ssef(P.x); @@ -390,11 +387,10 @@ ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, /* instance pop */ # if BVH_FEATURE(BVH_MOTION) - bvh_instance_motion_pop(kg, object, ray, &P, &dir, 
&idir, &isect->t, &ob_itfm); + isect->t = bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, isect->t, &ob_itfm); # else - bvh_instance_pop(kg, object, ray, &P, &dir, &idir, &isect->t); + isect->t = bvh_instance_pop(kg, object, ray, &P, &dir, &idir, isect->t); # endif - triangle_intersect_precalc(dir, &isect_precalc); # if defined(__KERNEL_SSE2__) Psplat[0] = ssef(P.x); diff --git a/intern/cycles/kernel/bvh/bvh_types.h b/intern/cycles/kernel/bvh/bvh_types.h index c3abe2e157d..ead424aaaaf 100644 --- a/intern/cycles/kernel/bvh/bvh_types.h +++ b/intern/cycles/kernel/bvh/bvh_types.h @@ -50,12 +50,17 @@ CCL_NAMESPACE_BEGIN #ifdef __KERNEL_DEBUG__ # define BVH_DEBUG_INIT() \ do { \ - isect->num_traversal_steps = 0; \ + isect->num_traversed_nodes = 0; \ isect->num_traversed_instances = 0; \ + isect->num_intersections = 0; \ } while(0) -# define BVH_DEBUG_NEXT_STEP() \ +# define BVH_DEBUG_NEXT_NODE() \ do { \ - ++isect->num_traversal_steps; \ + ++isect->num_traversed_nodes; \ + } while(0) +# define BVH_DEBUG_NEXT_INTERSECTION() \ + do { \ + ++isect->num_intersections; \ } while(0) # define BVH_DEBUG_NEXT_INSTANCE() \ do { \ @@ -63,7 +68,8 @@ CCL_NAMESPACE_BEGIN } while(0) #else /* __KERNEL_DEBUG__ */ # define BVH_DEBUG_INIT() -# define BVH_DEBUG_NEXT_STEP() +# define BVH_DEBUG_NEXT_NODE() +# define BVH_DEBUG_NEXT_INTERSECTION() # define BVH_DEBUG_NEXT_INSTANCE() #endif /* __KERNEL_DEBUG__ */ diff --git a/intern/cycles/kernel/bvh/bvh_volume.h b/intern/cycles/kernel/bvh/bvh_volume.h index f6db399080b..42e626c8e19 100644 --- a/intern/cycles/kernel/bvh/bvh_volume.h +++ b/intern/cycles/kernel/bvh/bvh_volume.h @@ -18,7 +18,7 @@ */ #ifdef __QBVH__ -# include "qbvh_volume.h" +# include "kernel/bvh/qbvh_volume.h" #endif #if BVH_FEATURE(BVH_HAIR) @@ -97,9 +97,6 @@ bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz); #endif - IsectPrecalc isect_precalc; - triangle_intersect_precalc(dir, 
&isect_precalc); - #if 1 /* try to intersect with VDB volumes */ int num_volumes = kernel_data.tables.num_volumes; @@ -212,9 +209,9 @@ bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, continue; } triangle_intersect(kg, - &isect_precalc, isect, P, + dir, visibility, object, prim_addr); @@ -254,17 +251,13 @@ bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, /* instance push */ object = kernel_tex_fetch(__prim_object, -prim_addr-1); int object_flag = kernel_tex_fetch(__object_flag, object); - if(object_flag & SD_OBJECT_HAS_VOLUME) { - # if BVH_FEATURE(BVH_MOTION) - bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &isect->t, &ob_itfm); + isect->t = bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, isect->t, &ob_itfm); # else - bvh_instance_push(kg, object, ray, &P, &dir, &idir, &isect->t); + isect->t = bvh_instance_push(kg, object, ray, &P, &dir, &idir, isect->t); # endif - triangle_intersect_precalc(dir, &isect_precalc); - # if defined(__KERNEL_SSE2__) Psplat[0] = ssef(P.x); Psplat[1] = ssef(P.y); @@ -301,13 +294,11 @@ bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, /* instance pop */ # if BVH_FEATURE(BVH_MOTION) - bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, &isect->t, &ob_itfm); + isect->t = bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, isect->t, &ob_itfm); # else - bvh_instance_pop(kg, object, ray, &P, &dir, &idir, &isect->t); + isect->t = bvh_instance_pop(kg, object, ray, &P, &dir, &idir, isect->t); # endif - triangle_intersect_precalc(dir, &isect_precalc); - # if defined(__KERNEL_SSE2__) Psplat[0] = ssef(P.x); Psplat[1] = ssef(P.y); diff --git a/intern/cycles/kernel/bvh/bvh_volume_all.h b/intern/cycles/kernel/bvh/bvh_volume_all.h index 04fe6e02b15..8c94512a0b9 100644 --- a/intern/cycles/kernel/bvh/bvh_volume_all.h +++ b/intern/cycles/kernel/bvh/bvh_volume_all.h @@ -18,7 +18,7 @@ */ #ifdef __QBVH__ -# include "qbvh_volume_all.h" +# include "kernel/bvh/qbvh_volume_all.h" #endif #if BVH_FEATURE(BVH_HAIR) @@ -128,9 
+128,6 @@ uint BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, } #endif - IsectPrecalc isect_precalc; - triangle_intersect_precalc(dir, &isect_precalc); - /* traversal loop */ do { do { @@ -226,9 +223,9 @@ uint BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, continue; } hit = triangle_intersect(kg, - &isect_precalc, isect_array, P, + dir, visibility, object, prim_addr); @@ -314,16 +311,13 @@ uint BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, /* instance push */ object = kernel_tex_fetch(__prim_object, -prim_addr-1); int object_flag = kernel_tex_fetch(__object_flag, object); - if(object_flag & SD_OBJECT_HAS_VOLUME) { - # if BVH_FEATURE(BVH_MOTION) - bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &isect_t, &ob_itfm); + isect_t = bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, isect_t, &ob_itfm); # else - bvh_instance_push(kg, object, ray, &P, &dir, &idir, &isect_t); + isect_t = bvh_instance_push(kg, object, ray, &P, &dir, &idir, isect_t); # endif - triangle_intersect_precalc(dir, &isect_precalc); num_hits_in_instance = 0; isect_array->t = isect_t; @@ -369,20 +363,17 @@ uint BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, # else bvh_instance_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac); # endif - triangle_intersect_precalc(dir, &isect_precalc); /* Scale isect->t to adjust for instancing. 
*/ for(int i = 0; i < num_hits_in_instance; i++) { (isect_array-i-1)->t *= t_fac; } } else { - float ignore_t = FLT_MAX; # if BVH_FEATURE(BVH_MOTION) - bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, &ignore_t, &ob_itfm); + bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX, &ob_itfm); # else - bvh_instance_pop(kg, object, ray, &P, &dir, &idir, &ignore_t); + bvh_instance_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX); # endif - triangle_intersect_precalc(dir, &isect_precalc); } isect_t = tmax; diff --git a/intern/cycles/kernel/bvh/qbvh_nodes.h b/intern/cycles/kernel/bvh/qbvh_nodes.h index 6d22f0b0d6a..3036efd4198 100644 --- a/intern/cycles/kernel/bvh/qbvh_nodes.h +++ b/intern/cycles/kernel/bvh/qbvh_nodes.h @@ -126,8 +126,8 @@ ccl_device_inline int qbvh_aligned_node_intersect(KernelGlobals *ccl_restrict kg const sseb vmask = cast(tnear) > cast(tfar); int mask = (int)movemask(vmask)^0xf; #else - const ssef tnear = max4(tnear_x, tnear_y, tnear_z, isect_near); - const ssef tfar = min4(tfar_x, tfar_y, tfar_z, isect_far); + const ssef tnear = max4(isect_near, tnear_x, tnear_y, tnear_z); + const ssef tfar = min4(isect_far, tfar_x, tfar_y, tfar_z); const sseb vmask = tnear <= tfar; int mask = (int)movemask(vmask); #endif @@ -174,8 +174,8 @@ ccl_device_inline int qbvh_aligned_node_intersect_robust( const float round_down = 1.0f - difl; const float round_up = 1.0f + difl; - const ssef tnear = max4(tnear_x, tnear_y, tnear_z, isect_near); - const ssef tfar = min4(tfar_x, tfar_y, tfar_z, isect_far); + const ssef tnear = max4(isect_near, tnear_x, tnear_y, tnear_z); + const ssef tfar = min4(isect_far, tfar_x, tfar_y, tfar_z); const sseb vmask = round_down*tnear <= round_up*tfar; *dist = tnear; return (int)movemask(vmask); diff --git a/intern/cycles/kernel/bvh/qbvh_shadow_all.h b/intern/cycles/kernel/bvh/qbvh_shadow_all.h index 5f4d06f12ea..522213f30ca 100644 --- a/intern/cycles/kernel/bvh/qbvh_shadow_all.h +++ b/intern/cycles/kernel/bvh/qbvh_shadow_all.h 
@@ -33,6 +33,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, const Ray *ray, Intersection *isect_array, + const uint visibility, const uint max_hits, uint *num_hits) { @@ -96,24 +97,28 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, &near_x, &near_y, &near_z, &far_x, &far_y, &far_z); - IsectPrecalc isect_precalc; - triangle_intersect_precalc(dir, &isect_precalc); - /* Traversal loop. */ do { do { /* Traverse internal nodes. */ while(node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) { float4 inodes = kernel_tex_fetch(__bvh_nodes, node_addr+0); + (void)inodes; + if(false #ifdef __VISIBILITY_FLAG__ - if((__float_as_uint(inodes.x) & PATH_RAY_SHADOW) == 0) { + || ((__float_as_uint(inodes.x) & visibility) == 0) +#endif +#if BVH_FEATURE(BVH_MOTION) + || UNLIKELY(ray->time < inodes.y) + || UNLIKELY(ray->time > inodes.z) +#endif + ) { /* Pop. */ node_addr = traversal_stack[stack_ptr].addr; --stack_ptr; continue; } -#endif ssef dist; int child_mask = NODE_INTERSECT(kg, @@ -239,7 +244,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, if(node_addr < 0) { float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-node_addr-1)); #ifdef __VISIBILITY_FLAG__ - if((__float_as_uint(leaf.z) & PATH_RAY_SHADOW) == 0) { + if((__float_as_uint(leaf.z) & visibility) == 0) { /* Pop. */ node_addr = traversal_stack[stack_ptr].addr; --stack_ptr; @@ -262,8 +267,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, /* Primitive intersection. 
*/ while(prim_addr < prim_addr2) { - kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); - + kernel_assert((kernel_tex_fetch(__prim_type, prim_addr) & PRIMITIVE_ALL) == p_type); bool hit; /* todo: specialized intersect functions which don't fill in @@ -273,10 +277,10 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, switch(p_type) { case PRIMITIVE_TRIANGLE: { hit = triangle_intersect(kg, - &isect_precalc, isect_array, P, - PATH_RAY_SHADOW, + dir, + visibility, object, prim_addr); break; @@ -288,7 +292,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, P, dir, ray->time, - PATH_RAY_SHADOW, + visibility, object, prim_addr); break; @@ -297,31 +301,32 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, #if BVH_FEATURE(BVH_HAIR) case PRIMITIVE_CURVE: case PRIMITIVE_MOTION_CURVE: { + const uint curve_type = kernel_tex_fetch(__prim_type, prim_addr); if(kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE) { - hit = bvh_cardinal_curve_intersect(kg, - isect_array, - P, - dir, - PATH_RAY_SHADOW, - object, - prim_addr, - ray->time, - type, - NULL, - 0, 0); + hit = cardinal_curve_intersect(kg, + isect_array, + P, + dir, + visibility, + object, + prim_addr, + ray->time, + curve_type, + NULL, + 0, 0); } else { - hit = bvh_curve_intersect(kg, - isect_array, - P, - dir, - PATH_RAY_SHADOW, - object, - prim_addr, - ray->time, - type, - NULL, - 0, 0); + hit = curve_intersect(kg, + isect_array, + P, + dir, + visibility, + object, + prim_addr, + ray->time, + curve_type, + NULL, + 0, 0); } break; } @@ -383,9 +388,9 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, object = kernel_tex_fetch(__prim_object, -prim_addr-1); # if BVH_FEATURE(BVH_MOTION) - bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &isect_t, &ob_itfm); + isect_t = bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, isect_t, &ob_itfm); # else - bvh_instance_push(kg, object, ray, &P, &dir, &idir, &isect_t); + isect_t = 
bvh_instance_push(kg, object, ray, &P, &dir, &idir, isect_t); # endif num_hits_in_instance = 0; @@ -407,8 +412,6 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z)); # endif - triangle_intersect_precalc(dir, &isect_precalc); - ++stack_ptr; kernel_assert(stack_ptr < BVH_QSTACK_SIZE); traversal_stack[stack_ptr].addr = ENTRYPOINT_SENTINEL; @@ -438,11 +441,10 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, } } else { - float ignore_t = FLT_MAX; # if BVH_FEATURE(BVH_MOTION) - bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, &ignore_t, &ob_itfm); + bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX, &ob_itfm); # else - bvh_instance_pop(kg, object, ray, &P, &dir, &idir, &ignore_t); + bvh_instance_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX); # endif } @@ -465,8 +467,6 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z)); # endif - triangle_intersect_precalc(dir, &isect_precalc); - object = OBJECT_NONE; node_addr = traversal_stack[stack_ptr].addr; --stack_ptr; diff --git a/intern/cycles/kernel/bvh/qbvh_subsurface.h b/intern/cycles/kernel/bvh/qbvh_subsurface.h index ccd36df034a..be7658d11d7 100644 --- a/intern/cycles/kernel/bvh/qbvh_subsurface.h +++ b/intern/cycles/kernel/bvh/qbvh_subsurface.h @@ -61,19 +61,19 @@ ccl_device void BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, ss_isect->num_hits = 0; const int object_flag = kernel_tex_fetch(__object_flag, subsurface_object); - if(!(object_flag & SD_TRANSFORM_APPLIED)) { + if(!(object_flag & SD_OBJECT_TRANSFORM_APPLIED)) { #if BVH_FEATURE(BVH_MOTION) Transform ob_itfm; - bvh_instance_motion_push(kg, - subsurface_object, - ray, - &P, - &dir, - &idir, - &isect_t, - &ob_itfm); + isect_t = bvh_instance_motion_push(kg, + subsurface_object, + ray, + &P, + &dir, + &idir, + isect_t, + &ob_itfm); #else - bvh_instance_push(kg, subsurface_object, ray, &P, &dir, &idir, 
&isect_t); + isect_t = bvh_instance_push(kg, subsurface_object, ray, &P, &dir, &idir, isect_t); #endif object = subsurface_object; } @@ -105,9 +105,6 @@ ccl_device void BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, &near_x, &near_y, &near_z, &far_x, &far_y, &far_z); - IsectPrecalc isect_precalc; - triangle_intersect_precalc(dir, &isect_precalc); - /* Traversal loop. */ do { do { @@ -253,9 +250,9 @@ ccl_device void BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, for(; prim_addr < prim_addr2; prim_addr++) { kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); triangle_intersect_subsurface(kg, - &isect_precalc, ss_isect, P, + dir, object, prim_addr, isect_t, diff --git a/intern/cycles/kernel/bvh/qbvh_traversal.h b/intern/cycles/kernel/bvh/qbvh_traversal.h index f2d8e558dcc..335a4afd47a 100644 --- a/intern/cycles/kernel/bvh/qbvh_traversal.h +++ b/intern/cycles/kernel/bvh/qbvh_traversal.h @@ -106,20 +106,23 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, &near_x, &near_y, &near_z, &far_x, &far_y, &far_z); - IsectPrecalc isect_precalc; - triangle_intersect_precalc(dir, &isect_precalc); - /* Traversal loop. */ do { do { /* Traverse internal nodes. */ while(node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) { float4 inodes = kernel_tex_fetch(__bvh_nodes, node_addr+0); + (void)inodes; if(UNLIKELY(node_dist > isect->t) +#if BVH_FEATURE(BVH_MOTION) + || UNLIKELY(ray->time < inodes.y) + || UNLIKELY(ray->time > inodes.z) +#endif #ifdef __VISIBILITY_FLAG__ - || (__float_as_uint(inodes.x) & visibility) == 0) + || (__float_as_uint(inodes.x) & visibility) == 0 #endif + ) { /* Pop. 
*/ node_addr = traversal_stack[stack_ptr].addr; @@ -131,7 +134,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, int child_mask; ssef dist; - BVH_DEBUG_NEXT_STEP(); + BVH_DEBUG_NEXT_NODE(); #if BVH_FEATURE(BVH_HAIR_MINIMUM_WIDTH) if(difl != 0.0f) { @@ -326,18 +329,18 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, switch(type & PRIMITIVE_ALL) { case PRIMITIVE_TRIANGLE: { for(; prim_addr < prim_addr2; prim_addr++) { - BVH_DEBUG_NEXT_STEP(); + BVH_DEBUG_NEXT_INTERSECTION(); kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); if(triangle_intersect(kg, - &isect_precalc, isect, P, + dir, visibility, object, prim_addr)) { tfar = ssef(isect->t); /* Shadow ray early termination. */ - if(visibility == PATH_RAY_SHADOW_OPAQUE) { + if(visibility & PATH_RAY_SHADOW_OPAQUE) { return true; } } @@ -347,7 +350,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, #if BVH_FEATURE(BVH_MOTION) case PRIMITIVE_MOTION_TRIANGLE: { for(; prim_addr < prim_addr2; prim_addr++) { - BVH_DEBUG_NEXT_STEP(); + BVH_DEBUG_NEXT_INTERSECTION(); kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); if(motion_triangle_intersect(kg, isect, @@ -359,7 +362,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, prim_addr)) { tfar = ssef(isect->t); /* Shadow ray early termination. 
*/ - if(visibility == PATH_RAY_SHADOW_OPAQUE) { + if(visibility & PATH_RAY_SHADOW_OPAQUE) { return true; } } @@ -371,41 +374,42 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, case PRIMITIVE_CURVE: case PRIMITIVE_MOTION_CURVE: { for(; prim_addr < prim_addr2; prim_addr++) { - BVH_DEBUG_NEXT_STEP(); - kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); + BVH_DEBUG_NEXT_INTERSECTION(); + const uint curve_type = kernel_tex_fetch(__prim_type, prim_addr); + kernel_assert((curve_type & PRIMITIVE_ALL) == (type & PRIMITIVE_ALL)); bool hit; if(kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE) { - hit = bvh_cardinal_curve_intersect(kg, - isect, - P, - dir, - visibility, - object, - prim_addr, - ray->time, - type, - lcg_state, - difl, - extmax); + hit = cardinal_curve_intersect(kg, + isect, + P, + dir, + visibility, + object, + prim_addr, + ray->time, + curve_type, + lcg_state, + difl, + extmax); } else { - hit = bvh_curve_intersect(kg, - isect, - P, - dir, - visibility, - object, - prim_addr, - ray->time, - type, - lcg_state, - difl, - extmax); + hit = curve_intersect(kg, + isect, + P, + dir, + visibility, + object, + prim_addr, + ray->time, + curve_type, + lcg_state, + difl, + extmax); } if(hit) { tfar = ssef(isect->t); /* Shadow ray early termination. */ - if(visibility == PATH_RAY_SHADOW_OPAQUE) { + if(visibility & PATH_RAY_SHADOW_OPAQUE) { return true; } } @@ -442,8 +446,6 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z)); # endif - triangle_intersect_precalc(dir, &isect_precalc); - ++stack_ptr; kernel_assert(stack_ptr < BVH_QSTACK_SIZE); traversal_stack[stack_ptr].addr = ENTRYPOINT_SENTINEL; @@ -463,9 +465,9 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, /* Instance pop. 
*/ # if BVH_FEATURE(BVH_MOTION) - bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, &isect->t, &ob_itfm); + isect->t = bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, isect->t, &ob_itfm); # else - bvh_instance_pop(kg, object, ray, &P, &dir, &idir, &isect->t); + isect->t = bvh_instance_pop(kg, object, ray, &P, &dir, &idir, isect->t); # endif qbvh_near_far_idx_calc(idir, @@ -484,8 +486,6 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z)); # endif - triangle_intersect_precalc(dir, &isect_precalc); - object = OBJECT_NONE; node_addr = traversal_stack[stack_ptr].addr; node_dist = traversal_stack[stack_ptr].dist; diff --git a/intern/cycles/kernel/bvh/qbvh_volume.h b/intern/cycles/kernel/bvh/qbvh_volume.h index 989873b549b..bcda7bbd251 100644 --- a/intern/cycles/kernel/bvh/qbvh_volume.h +++ b/intern/cycles/kernel/bvh/qbvh_volume.h @@ -91,9 +91,6 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, &near_x, &near_y, &near_z, &far_x, &far_y, &far_z); - IsectPrecalc isect_precalc; - triangle_intersect_precalc(dir, &isect_precalc); - #if 1 /* try to intersect with VDB volumes */ int num_volumes = kernel_data.tables.num_volumes; @@ -284,7 +281,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, continue; } /* Intersect ray against primitive. */ - triangle_intersect(kg, &isect_precalc, isect, P, visibility, object, prim_addr); + triangle_intersect(kg, isect, P, dir, visibility, object, prim_addr); } break; } @@ -311,13 +308,11 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, /* Instance push. 
*/ object = kernel_tex_fetch(__prim_object, -prim_addr-1); int object_flag = kernel_tex_fetch(__object_flag, object); - if(object_flag & SD_OBJECT_HAS_VOLUME) { - # if BVH_FEATURE(BVH_MOTION) - bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &isect->t, &ob_itfm); + isect->t = bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, isect->t, &ob_itfm); # else - bvh_instance_push(kg, object, ray, &P, &dir, &idir, &isect->t); + isect->t = bvh_instance_push(kg, object, ray, &P, &dir, &idir, isect->t); # endif qbvh_near_far_idx_calc(idir, @@ -336,8 +331,6 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z)); # endif - triangle_intersect_precalc(dir, &isect_precalc); - ++stack_ptr; kernel_assert(stack_ptr < BVH_QSTACK_SIZE); traversal_stack[stack_ptr].addr = ENTRYPOINT_SENTINEL; @@ -361,9 +354,9 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, /* Instance pop. */ # if BVH_FEATURE(BVH_MOTION) - bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, &isect->t, &ob_itfm); + isect->t = bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, isect->t, &ob_itfm); # else - bvh_instance_pop(kg, object, ray, &P, &dir, &idir, &isect->t); + isect->t = bvh_instance_pop(kg, object, ray, &P, &dir, &idir, isect->t); # endif qbvh_near_far_idx_calc(idir, @@ -382,8 +375,6 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z)); # endif - triangle_intersect_precalc(dir, &isect_precalc); - object = OBJECT_NONE; node_addr = traversal_stack[stack_ptr].addr; --stack_ptr; diff --git a/intern/cycles/kernel/bvh/qbvh_volume_all.h b/intern/cycles/kernel/bvh/qbvh_volume_all.h index 87bbca5d85c..26f31c379c3 100644 --- a/intern/cycles/kernel/bvh/qbvh_volume_all.h +++ b/intern/cycles/kernel/bvh/qbvh_volume_all.h @@ -95,9 +95,6 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, &near_x, &near_y, &near_z, &far_x, &far_y, &far_z); - 
IsectPrecalc isect_precalc; - triangle_intersect_precalc(dir, &isect_precalc); - #if 1 /* try to intersect with VDB volumes */ int num_volumes = kernel_data.tables.num_volumes; @@ -298,7 +295,7 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, continue; } /* Intersect ray against primitive. */ - hit = triangle_intersect(kg, &isect_precalc, isect_array, P, visibility, object, prim_addr); + hit = triangle_intersect(kg, isect_array, P, dir, visibility, object, prim_addr); if(hit) { /* Move on to next entry in intersections array. */ isect_array++; @@ -371,13 +368,11 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, /* Instance push. */ object = kernel_tex_fetch(__prim_object, -prim_addr-1); int object_flag = kernel_tex_fetch(__object_flag, object); - if(object_flag & SD_OBJECT_HAS_VOLUME) { - # if BVH_FEATURE(BVH_MOTION) - bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &isect_t, &ob_itfm); + isect_t = bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, isect_t, &ob_itfm); # else - bvh_instance_push(kg, object, ray, &P, &dir, &idir, &isect_t); + isect_t = bvh_instance_push(kg, object, ray, &P, &dir, &idir, isect_t); # endif qbvh_near_far_idx_calc(idir, @@ -396,7 +391,6 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z)); # endif - triangle_intersect_precalc(dir, &isect_precalc); num_hits_in_instance = 0; isect_array->t = isect_t; @@ -435,11 +429,10 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, } } else { - float ignore_t = FLT_MAX; # if BVH_FEATURE(BVH_MOTION) - bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, &ignore_t, &ob_itfm); + bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX, &ob_itfm); # else - bvh_instance_pop(kg, object, ray, &P, &dir, &idir, &ignore_t); + bvh_instance_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX); # endif } @@ -462,8 +455,6 @@ ccl_device uint 
BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z)); # endif - triangle_intersect_precalc(dir, &isect_precalc); - object = OBJECT_NONE; node_addr = traversal_stack[stack_ptr].addr; --stack_ptr; diff --git a/intern/cycles/kernel/closure/alloc.h b/intern/cycles/kernel/closure/alloc.h index b7abc1ec507..e799855a65e 100644 --- a/intern/cycles/kernel/closure/alloc.h +++ b/intern/cycles/kernel/closure/alloc.h @@ -20,17 +20,17 @@ ccl_device ShaderClosure *closure_alloc(ShaderData *sd, int size, ClosureType ty { kernel_assert(size <= sizeof(ShaderClosure)); - int num_closure = ccl_fetch(sd, num_closure); - int num_closure_extra = ccl_fetch(sd, num_closure_extra); + int num_closure = sd->num_closure; + int num_closure_extra = sd->num_closure_extra; if(num_closure + num_closure_extra >= MAX_CLOSURE) return NULL; - ShaderClosure *sc = &ccl_fetch(sd, closure)[num_closure]; + ShaderClosure *sc = &sd->closure[num_closure]; sc->type = type; sc->weight = weight; - ccl_fetch(sd, num_closure)++; + sd->num_closure++; return sc; } @@ -44,25 +44,25 @@ ccl_device ccl_addr_space void *closure_alloc_extra(ShaderData *sd, int size) * This lets us keep the same fast array iteration over closures, as we * found linked list iteration and iteration with skipping to be slower. */ int num_extra = ((size + sizeof(ShaderClosure) - 1) / sizeof(ShaderClosure)); - int num_closure = ccl_fetch(sd, num_closure); - int num_closure_extra = ccl_fetch(sd, num_closure_extra) + num_extra; + int num_closure = sd->num_closure; + int num_closure_extra = sd->num_closure_extra + num_extra; if(num_closure + num_closure_extra > MAX_CLOSURE) { /* Remove previous closure. 
*/ - ccl_fetch(sd, num_closure)--; - ccl_fetch(sd, num_closure_extra)++; + sd->num_closure--; + sd->num_closure_extra++; return NULL; } - ccl_fetch(sd, num_closure_extra) = num_closure_extra; - return (ccl_addr_space void*)(ccl_fetch(sd, closure) + MAX_CLOSURE - num_closure_extra); + sd->num_closure_extra = num_closure_extra; + return (ccl_addr_space void*)(sd->closure + MAX_CLOSURE - num_closure_extra); } ccl_device_inline ShaderClosure *bsdf_alloc(ShaderData *sd, int size, float3 weight) { ShaderClosure *sc = closure_alloc(sd, size, CLOSURE_NONE_ID, weight); - if(!sc) + if(sc == NULL) return NULL; float sample_weight = fabsf(average(weight)); diff --git a/intern/cycles/kernel/closure/bsdf.h b/intern/cycles/kernel/closure/bsdf.h index 7e4d5fe2e37..86a00d2124d 100644 --- a/intern/cycles/kernel/closure/bsdf.h +++ b/intern/cycles/kernel/closure/bsdf.h @@ -14,126 +14,144 @@ * limitations under the License. */ -#include "../closure/bsdf_ashikhmin_velvet.h" -#include "../closure/bsdf_diffuse.h" -#include "../closure/bsdf_oren_nayar.h" -#include "../closure/bsdf_phong_ramp.h" -#include "../closure/bsdf_diffuse_ramp.h" -#include "../closure/bsdf_microfacet.h" -#include "../closure/bsdf_microfacet_multi.h" -#include "../closure/bsdf_reflection.h" -#include "../closure/bsdf_refraction.h" -#include "../closure/bsdf_transparent.h" -#include "../closure/bsdf_ashikhmin_shirley.h" -#include "../closure/bsdf_toon.h" -#include "../closure/bsdf_hair.h" +#include "kernel/closure/bsdf_ashikhmin_velvet.h" +#include "kernel/closure/bsdf_diffuse.h" +#include "kernel/closure/bsdf_oren_nayar.h" +#include "kernel/closure/bsdf_phong_ramp.h" +#include "kernel/closure/bsdf_diffuse_ramp.h" +#include "kernel/closure/bsdf_microfacet.h" +#include "kernel/closure/bsdf_microfacet_multi.h" +#include "kernel/closure/bsdf_reflection.h" +#include "kernel/closure/bsdf_refraction.h" +#include "kernel/closure/bsdf_transparent.h" +#include "kernel/closure/bsdf_ashikhmin_shirley.h" +#include 
"kernel/closure/bsdf_toon.h" +#include "kernel/closure/bsdf_hair.h" +#include "kernel/closure/bsdf_principled_diffuse.h" +#include "kernel/closure/bsdf_principled_sheen.h" #ifdef __SUBSURFACE__ -# include "../closure/bssrdf.h" +# include "kernel/closure/bssrdf.h" #endif #ifdef __VOLUME__ -# include "../closure/volume.h" +# include "kernel/closure/volume.h" #endif CCL_NAMESPACE_BEGIN ccl_device_forceinline int bsdf_sample(KernelGlobals *kg, - ShaderData *sd, - const ShaderClosure *sc, - float randu, - float randv, - float3 *eval, - float3 *omega_in, - differential3 *domega_in, - float *pdf) + ShaderData *sd, + const ShaderClosure *sc, + float randu, + float randv, + float3 *eval, + float3 *omega_in, + differential3 *domega_in, + float *pdf) { int label; switch(sc->type) { case CLOSURE_BSDF_DIFFUSE_ID: case CLOSURE_BSDF_BSSRDF_ID: - label = bsdf_diffuse_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv, + label = bsdf_diffuse_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf); break; #ifdef __SVM__ case CLOSURE_BSDF_OREN_NAYAR_ID: - label = bsdf_oren_nayar_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv, + label = bsdf_oren_nayar_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf); break; #ifdef __OSL__ case CLOSURE_BSDF_PHONG_RAMP_ID: - label = bsdf_phong_ramp_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv, + label = bsdf_phong_ramp_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf); break; case CLOSURE_BSDF_DIFFUSE_RAMP_ID: - label = bsdf_diffuse_ramp_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv, + label = bsdf_diffuse_ramp_sample(sc, sd->Ng, sd->I, 
sd->dI.dx, sd->dI.dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf); break; #endif case CLOSURE_BSDF_TRANSLUCENT_ID: - label = bsdf_translucent_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv, + label = bsdf_translucent_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf); break; case CLOSURE_BSDF_REFLECTION_ID: - label = bsdf_reflection_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv, + label = bsdf_reflection_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf); break; case CLOSURE_BSDF_REFRACTION_ID: - label = bsdf_refraction_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv, + label = bsdf_refraction_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf); break; case CLOSURE_BSDF_TRANSPARENT_ID: - label = bsdf_transparent_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv, + label = bsdf_transparent_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf); break; case CLOSURE_BSDF_MICROFACET_GGX_ID: + case CLOSURE_BSDF_MICROFACET_GGX_FRESNEL_ID: + case CLOSURE_BSDF_MICROFACET_GGX_CLEARCOAT_ID: case CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID: + case CLOSURE_BSDF_MICROFACET_GGX_ANISO_FRESNEL_ID: case CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID: - label = bsdf_microfacet_ggx_sample(kg, sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv, + label = bsdf_microfacet_ggx_sample(kg, sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf); break; case CLOSURE_BSDF_MICROFACET_MULTI_GGX_ID: - label = 
bsdf_microfacet_multi_ggx_sample(kg, sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv, - eval, omega_in, &domega_in->dx, &domega_in->dy, pdf, &ccl_fetch(sd, lcg_state)); + case CLOSURE_BSDF_MICROFACET_MULTI_GGX_FRESNEL_ID: + label = bsdf_microfacet_multi_ggx_sample(kg, sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, + eval, omega_in, &domega_in->dx, &domega_in->dy, pdf, &sd->lcg_state); break; case CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID: - label = bsdf_microfacet_multi_ggx_glass_sample(kg, sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv, - eval, omega_in, &domega_in->dx, &domega_in->dy, pdf, &ccl_fetch(sd, lcg_state)); + case CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_FRESNEL_ID: + label = bsdf_microfacet_multi_ggx_glass_sample(kg, sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, + eval, omega_in, &domega_in->dx, &domega_in->dy, pdf, &sd->lcg_state); break; case CLOSURE_BSDF_MICROFACET_BECKMANN_ID: case CLOSURE_BSDF_MICROFACET_BECKMANN_ANISO_ID: case CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID: - label = bsdf_microfacet_beckmann_sample(kg, sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv, + label = bsdf_microfacet_beckmann_sample(kg, sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf); break; case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ID: case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ANISO_ID: - label = bsdf_ashikhmin_shirley_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv, + label = bsdf_ashikhmin_shirley_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf); break; case CLOSURE_BSDF_ASHIKHMIN_VELVET_ID: - label = bsdf_ashikhmin_velvet_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv, + label 
= bsdf_ashikhmin_velvet_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf); break; case CLOSURE_BSDF_DIFFUSE_TOON_ID: - label = bsdf_diffuse_toon_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv, + label = bsdf_diffuse_toon_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf); break; case CLOSURE_BSDF_GLOSSY_TOON_ID: - label = bsdf_glossy_toon_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv, + label = bsdf_glossy_toon_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf); break; case CLOSURE_BSDF_HAIR_REFLECTION_ID: - label = bsdf_hair_reflection_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv, + label = bsdf_hair_reflection_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf); break; case CLOSURE_BSDF_HAIR_TRANSMISSION_ID: - label = bsdf_hair_transmission_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv, + label = bsdf_hair_transmission_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf); break; +#ifdef __PRINCIPLED__ + case CLOSURE_BSDF_PRINCIPLED_DIFFUSE_ID: + case CLOSURE_BSDF_BSSRDF_PRINCIPLED_ID: + label = bsdf_principled_diffuse_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, + eval, omega_in, &domega_in->dx, &domega_in->dy, pdf); + break; + case CLOSURE_BSDF_PRINCIPLED_SHEEN_ID: + label = bsdf_principled_sheen_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, + eval, omega_in, &domega_in->dx, &domega_in->dy, pdf); + break; +#endif /* __PRINCIPLED__ */ #endif #ifdef __VOLUME__ case 
CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID: - label = volume_henyey_greenstein_sample(sc, ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf); + label = volume_henyey_greenstein_sample(sc, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf); break; #endif default: @@ -157,75 +175,89 @@ float3 bsdf_eval(KernelGlobals *kg, { float3 eval; - if(dot(ccl_fetch(sd, Ng), omega_in) >= 0.0f) { + if(dot(sd->Ng, omega_in) >= 0.0f) { switch(sc->type) { case CLOSURE_BSDF_DIFFUSE_ID: case CLOSURE_BSDF_BSSRDF_ID: - eval = bsdf_diffuse_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf); + eval = bsdf_diffuse_eval_reflect(sc, sd->I, omega_in, pdf); break; #ifdef __SVM__ case CLOSURE_BSDF_OREN_NAYAR_ID: - eval = bsdf_oren_nayar_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf); + eval = bsdf_oren_nayar_eval_reflect(sc, sd->I, omega_in, pdf); break; #ifdef __OSL__ case CLOSURE_BSDF_PHONG_RAMP_ID: - eval = bsdf_phong_ramp_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf); + eval = bsdf_phong_ramp_eval_reflect(sc, sd->I, omega_in, pdf); break; case CLOSURE_BSDF_DIFFUSE_RAMP_ID: - eval = bsdf_diffuse_ramp_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf); + eval = bsdf_diffuse_ramp_eval_reflect(sc, sd->I, omega_in, pdf); break; #endif case CLOSURE_BSDF_TRANSLUCENT_ID: - eval = bsdf_translucent_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf); + eval = bsdf_translucent_eval_reflect(sc, sd->I, omega_in, pdf); break; case CLOSURE_BSDF_REFLECTION_ID: - eval = bsdf_reflection_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf); + eval = bsdf_reflection_eval_reflect(sc, sd->I, omega_in, pdf); break; case CLOSURE_BSDF_REFRACTION_ID: - eval = bsdf_refraction_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf); + eval = bsdf_refraction_eval_reflect(sc, sd->I, omega_in, pdf); break; case CLOSURE_BSDF_TRANSPARENT_ID: - eval = bsdf_transparent_eval_reflect(sc, ccl_fetch(sd, I), omega_in, 
pdf); + eval = bsdf_transparent_eval_reflect(sc, sd->I, omega_in, pdf); break; case CLOSURE_BSDF_MICROFACET_GGX_ID: + case CLOSURE_BSDF_MICROFACET_GGX_FRESNEL_ID: + case CLOSURE_BSDF_MICROFACET_GGX_CLEARCOAT_ID: case CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID: + case CLOSURE_BSDF_MICROFACET_GGX_ANISO_FRESNEL_ID: case CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID: - eval = bsdf_microfacet_ggx_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf); + eval = bsdf_microfacet_ggx_eval_reflect(sc, sd->I, omega_in, pdf); break; case CLOSURE_BSDF_MICROFACET_MULTI_GGX_ID: - eval = bsdf_microfacet_multi_ggx_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf, &ccl_fetch(sd, lcg_state)); + case CLOSURE_BSDF_MICROFACET_MULTI_GGX_FRESNEL_ID: + eval = bsdf_microfacet_multi_ggx_eval_reflect(sc, sd->I, omega_in, pdf, &sd->lcg_state); break; case CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID: - eval = bsdf_microfacet_multi_ggx_glass_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf, &ccl_fetch(sd, lcg_state)); + case CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_FRESNEL_ID: + eval = bsdf_microfacet_multi_ggx_glass_eval_reflect(sc, sd->I, omega_in, pdf, &sd->lcg_state); break; case CLOSURE_BSDF_MICROFACET_BECKMANN_ID: case CLOSURE_BSDF_MICROFACET_BECKMANN_ANISO_ID: case CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID: - eval = bsdf_microfacet_beckmann_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf); + eval = bsdf_microfacet_beckmann_eval_reflect(sc, sd->I, omega_in, pdf); break; case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ID: case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ANISO_ID: - eval = bsdf_ashikhmin_shirley_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf); + eval = bsdf_ashikhmin_shirley_eval_reflect(sc, sd->I, omega_in, pdf); break; case CLOSURE_BSDF_ASHIKHMIN_VELVET_ID: - eval = bsdf_ashikhmin_velvet_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf); + eval = bsdf_ashikhmin_velvet_eval_reflect(sc, sd->I, omega_in, pdf); break; case CLOSURE_BSDF_DIFFUSE_TOON_ID: - eval = bsdf_diffuse_toon_eval_reflect(sc, ccl_fetch(sd, 
I), omega_in, pdf); + eval = bsdf_diffuse_toon_eval_reflect(sc, sd->I, omega_in, pdf); break; case CLOSURE_BSDF_GLOSSY_TOON_ID: - eval = bsdf_glossy_toon_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf); + eval = bsdf_glossy_toon_eval_reflect(sc, sd->I, omega_in, pdf); break; case CLOSURE_BSDF_HAIR_REFLECTION_ID: - eval = bsdf_hair_reflection_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf); + eval = bsdf_hair_reflection_eval_reflect(sc, sd->I, omega_in, pdf); break; case CLOSURE_BSDF_HAIR_TRANSMISSION_ID: - eval = bsdf_hair_transmission_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf); + eval = bsdf_hair_transmission_eval_reflect(sc, sd->I, omega_in, pdf); + break; +#ifdef __PRINCIPLED__ + case CLOSURE_BSDF_PRINCIPLED_DIFFUSE_ID: + case CLOSURE_BSDF_BSSRDF_PRINCIPLED_ID: + eval = bsdf_principled_diffuse_eval_reflect(sc, sd->I, omega_in, pdf); break; + case CLOSURE_BSDF_PRINCIPLED_SHEEN_ID: + eval = bsdf_principled_sheen_eval_reflect(sc, sd->I, omega_in, pdf); + break; +#endif /* __PRINCIPLED__ */ #endif #ifdef __VOLUME__ case CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID: - eval = volume_henyey_greenstein_eval_phase(sc, ccl_fetch(sd, I), omega_in, pdf); + eval = volume_henyey_greenstein_eval_phase(sc, sd->I, omega_in, pdf); break; #endif default: @@ -237,63 +269,77 @@ float3 bsdf_eval(KernelGlobals *kg, switch(sc->type) { case CLOSURE_BSDF_DIFFUSE_ID: case CLOSURE_BSDF_BSSRDF_ID: - eval = bsdf_diffuse_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf); + eval = bsdf_diffuse_eval_transmit(sc, sd->I, omega_in, pdf); break; #ifdef __SVM__ case CLOSURE_BSDF_OREN_NAYAR_ID: - eval = bsdf_oren_nayar_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf); + eval = bsdf_oren_nayar_eval_transmit(sc, sd->I, omega_in, pdf); break; case CLOSURE_BSDF_TRANSLUCENT_ID: - eval = bsdf_translucent_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf); + eval = bsdf_translucent_eval_transmit(sc, sd->I, omega_in, pdf); break; case CLOSURE_BSDF_REFLECTION_ID: - eval = 
bsdf_reflection_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf); + eval = bsdf_reflection_eval_transmit(sc, sd->I, omega_in, pdf); break; case CLOSURE_BSDF_REFRACTION_ID: - eval = bsdf_refraction_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf); + eval = bsdf_refraction_eval_transmit(sc, sd->I, omega_in, pdf); break; case CLOSURE_BSDF_TRANSPARENT_ID: - eval = bsdf_transparent_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf); + eval = bsdf_transparent_eval_transmit(sc, sd->I, omega_in, pdf); break; case CLOSURE_BSDF_MICROFACET_GGX_ID: + case CLOSURE_BSDF_MICROFACET_GGX_FRESNEL_ID: + case CLOSURE_BSDF_MICROFACET_GGX_CLEARCOAT_ID: case CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID: + case CLOSURE_BSDF_MICROFACET_GGX_ANISO_FRESNEL_ID: case CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID: - eval = bsdf_microfacet_ggx_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf); + eval = bsdf_microfacet_ggx_eval_transmit(sc, sd->I, omega_in, pdf); break; case CLOSURE_BSDF_MICROFACET_MULTI_GGX_ID: - eval = bsdf_microfacet_multi_ggx_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf, &ccl_fetch(sd, lcg_state)); + case CLOSURE_BSDF_MICROFACET_MULTI_GGX_FRESNEL_ID: + eval = bsdf_microfacet_multi_ggx_eval_transmit(sc, sd->I, omega_in, pdf, &sd->lcg_state); break; case CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID: - eval = bsdf_microfacet_multi_ggx_glass_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf, &ccl_fetch(sd, lcg_state)); + case CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_FRESNEL_ID: + eval = bsdf_microfacet_multi_ggx_glass_eval_transmit(sc, sd->I, omega_in, pdf, &sd->lcg_state); break; case CLOSURE_BSDF_MICROFACET_BECKMANN_ID: case CLOSURE_BSDF_MICROFACET_BECKMANN_ANISO_ID: case CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID: - eval = bsdf_microfacet_beckmann_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf); + eval = bsdf_microfacet_beckmann_eval_transmit(sc, sd->I, omega_in, pdf); break; case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ID: case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ANISO_ID: - eval = 
bsdf_ashikhmin_shirley_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf); + eval = bsdf_ashikhmin_shirley_eval_transmit(sc, sd->I, omega_in, pdf); break; case CLOSURE_BSDF_ASHIKHMIN_VELVET_ID: - eval = bsdf_ashikhmin_velvet_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf); + eval = bsdf_ashikhmin_velvet_eval_transmit(sc, sd->I, omega_in, pdf); break; case CLOSURE_BSDF_DIFFUSE_TOON_ID: - eval = bsdf_diffuse_toon_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf); + eval = bsdf_diffuse_toon_eval_transmit(sc, sd->I, omega_in, pdf); break; case CLOSURE_BSDF_GLOSSY_TOON_ID: - eval = bsdf_glossy_toon_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf); + eval = bsdf_glossy_toon_eval_transmit(sc, sd->I, omega_in, pdf); break; case CLOSURE_BSDF_HAIR_REFLECTION_ID: - eval = bsdf_hair_reflection_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf); + eval = bsdf_hair_reflection_eval_transmit(sc, sd->I, omega_in, pdf); break; case CLOSURE_BSDF_HAIR_TRANSMISSION_ID: - eval = bsdf_hair_transmission_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf); + eval = bsdf_hair_transmission_eval_transmit(sc, sd->I, omega_in, pdf); + break; +#ifdef __PRINCIPLED__ + case CLOSURE_BSDF_PRINCIPLED_DIFFUSE_ID: + case CLOSURE_BSDF_BSSRDF_PRINCIPLED_ID: + eval = bsdf_principled_diffuse_eval_transmit(sc, sd->I, omega_in, pdf); break; + case CLOSURE_BSDF_PRINCIPLED_SHEEN_ID: + eval = bsdf_principled_sheen_eval_transmit(sc, sd->I, omega_in, pdf); + break; +#endif /* __PRINCIPLED__ */ #endif #ifdef __VOLUME__ case CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID: - eval = volume_henyey_greenstein_eval_phase(sc, ccl_fetch(sd, I), omega_in, pdf); + eval = volume_henyey_greenstein_eval_phase(sc, sd->I, omega_in, pdf); break; #endif default: @@ -311,11 +357,16 @@ ccl_device void bsdf_blur(KernelGlobals *kg, ShaderClosure *sc, float roughness) #ifdef __SVM__ switch(sc->type) { case CLOSURE_BSDF_MICROFACET_MULTI_GGX_ID: + case CLOSURE_BSDF_MICROFACET_MULTI_GGX_FRESNEL_ID: case 
CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID: + case CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_FRESNEL_ID: bsdf_microfacet_multi_ggx_blur(sc, roughness); break; case CLOSURE_BSDF_MICROFACET_GGX_ID: + case CLOSURE_BSDF_MICROFACET_GGX_FRESNEL_ID: + case CLOSURE_BSDF_MICROFACET_GGX_CLEARCOAT_ID: case CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID: + case CLOSURE_BSDF_MICROFACET_GGX_ANISO_FRESNEL_ID: case CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID: bsdf_microfacet_ggx_blur(sc, roughness); break; @@ -349,10 +400,15 @@ ccl_device bool bsdf_merge(ShaderClosure *a, ShaderClosure *b) case CLOSURE_BSDF_REFLECTION_ID: case CLOSURE_BSDF_REFRACTION_ID: case CLOSURE_BSDF_MICROFACET_GGX_ID: + case CLOSURE_BSDF_MICROFACET_GGX_FRESNEL_ID: + case CLOSURE_BSDF_MICROFACET_GGX_CLEARCOAT_ID: case CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID: + case CLOSURE_BSDF_MICROFACET_GGX_ANISO_FRESNEL_ID: case CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID: case CLOSURE_BSDF_MICROFACET_MULTI_GGX_ID: + case CLOSURE_BSDF_MICROFACET_MULTI_GGX_FRESNEL_ID: case CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID: + case CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_FRESNEL_ID: case CLOSURE_BSDF_MICROFACET_BECKMANN_ID: case CLOSURE_BSDF_MICROFACET_BECKMANN_ANISO_ID: case CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID: @@ -367,6 +423,11 @@ ccl_device bool bsdf_merge(ShaderClosure *a, ShaderClosure *b) case CLOSURE_BSDF_HAIR_REFLECTION_ID: case CLOSURE_BSDF_HAIR_TRANSMISSION_ID: return bsdf_hair_merge(a, b); +#ifdef __PRINCIPLED__ + case CLOSURE_BSDF_PRINCIPLED_DIFFUSE_ID: + case CLOSURE_BSDF_BSSRDF_PRINCIPLED_ID: + return bsdf_principled_diffuse_merge(a, b); +#endif #ifdef __VOLUME__ case CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID: return volume_henyey_greenstein_merge(a, b); @@ -379,5 +440,23 @@ ccl_device bool bsdf_merge(ShaderClosure *a, ShaderClosure *b) #endif } +/* Classifies a closure as diffuse-like or specular-like. 
+ * This is needed for the denoising feature pass generation, + * which are written on the first bounce where more than 25% + * of the sampling weight belongs to diffuse-line closures. */ +ccl_device_inline bool bsdf_is_specular_like(ShaderClosure *sc) +{ + if(CLOSURE_IS_BSDF_TRANSPARENT(sc->type)) { + return true; + } + + if(CLOSURE_IS_BSDF_MICROFACET(sc->type)) { + MicrofacetBsdf *bsdf = (MicrofacetBsdf*) sc; + return (bsdf->alpha_x*bsdf->alpha_y <= 0.075f*0.075f); + } + + return false; +} + CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/closure/bsdf_ashikhmin_shirley.h b/intern/cycles/kernel/closure/bsdf_ashikhmin_shirley.h index 1cd8246aa71..b6c896c754b 100644 --- a/intern/cycles/kernel/closure/bsdf_ashikhmin_shirley.h +++ b/intern/cycles/kernel/closure/bsdf_ashikhmin_shirley.h @@ -143,6 +143,7 @@ ccl_device int bsdf_ashikhmin_shirley_sample(const ShaderClosure *sc, float3 Ng, { const MicrofacetBsdf *bsdf = (const MicrofacetBsdf*)sc; float3 N = bsdf->N; + int label = LABEL_REFLECT | LABEL_GLOSSY; float NdotI = dot(N, I); if(NdotI > 0.0f) { @@ -211,6 +212,7 @@ ccl_device int bsdf_ashikhmin_shirley_sample(const ShaderClosure *sc, float3 Ng, /* Some high number for MIS. 
*/ *pdf = 1e6f; *eval = make_float3(1e6f, 1e6f, 1e6f); + label = LABEL_REFLECT | LABEL_SINGULAR; } else { /* leave the rest to eval_reflect */ @@ -224,7 +226,7 @@ ccl_device int bsdf_ashikhmin_shirley_sample(const ShaderClosure *sc, float3 Ng, #endif } - return LABEL_REFLECT|LABEL_GLOSSY; + return label; } diff --git a/intern/cycles/kernel/closure/bsdf_ashikhmin_velvet.h b/intern/cycles/kernel/closure/bsdf_ashikhmin_velvet.h index 7e0f5a7ec75..a5ba2cb2972 100644 --- a/intern/cycles/kernel/closure/bsdf_ashikhmin_velvet.h +++ b/intern/cycles/kernel/closure/bsdf_ashikhmin_velvet.h @@ -40,7 +40,6 @@ typedef ccl_addr_space struct VelvetBsdf { float sigma; float invsigma2; - float3 N; } VelvetBsdf; ccl_device int bsdf_ashikhmin_velvet_setup(VelvetBsdf *bsdf) diff --git a/intern/cycles/kernel/closure/bsdf_diffuse.h b/intern/cycles/kernel/closure/bsdf_diffuse.h index dcd187f9305..ec6f1f20996 100644 --- a/intern/cycles/kernel/closure/bsdf_diffuse.h +++ b/intern/cycles/kernel/closure/bsdf_diffuse.h @@ -37,7 +37,6 @@ CCL_NAMESPACE_BEGIN typedef ccl_addr_space struct DiffuseBsdf { SHADER_CLOSURE_BASE; - float3 N; } DiffuseBsdf; /* DIFFUSE */ diff --git a/intern/cycles/kernel/closure/bsdf_diffuse_ramp.h b/intern/cycles/kernel/closure/bsdf_diffuse_ramp.h index 2d982a95fe4..24f40af46a3 100644 --- a/intern/cycles/kernel/closure/bsdf_diffuse_ramp.h +++ b/intern/cycles/kernel/closure/bsdf_diffuse_ramp.h @@ -40,7 +40,6 @@ CCL_NAMESPACE_BEGIN typedef ccl_addr_space struct DiffuseRampBsdf { SHADER_CLOSURE_BASE; - float3 N; float3 *colors; } DiffuseRampBsdf; diff --git a/intern/cycles/kernel/closure/bsdf_hair.h b/intern/cycles/kernel/closure/bsdf_hair.h index bede5f45e7e..daaa26dc6ad 100644 --- a/intern/cycles/kernel/closure/bsdf_hair.h +++ b/intern/cycles/kernel/closure/bsdf_hair.h @@ -267,7 +267,10 @@ ccl_device int bsdf_hair_transmission_sample(const ShaderClosure *sc, float3 Ng, *eval = make_float3(*pdf, *pdf, *pdf); - kernel_assert(dot(locy, *omega_in) < 0.0f); + /* TODO(sergey): 
Should always be negative, but seems some precision issue + * is involved here. + */ + kernel_assert(dot(locy, *omega_in) < 1e-4f); return LABEL_TRANSMIT|LABEL_GLOSSY; } diff --git a/intern/cycles/kernel/closure/bsdf_microfacet.h b/intern/cycles/kernel/closure/bsdf_microfacet.h index 0a8d14a00c2..a780bd0cf28 100644 --- a/intern/cycles/kernel/closure/bsdf_microfacet.h +++ b/intern/cycles/kernel/closure/bsdf_microfacet.h @@ -36,7 +36,8 @@ CCL_NAMESPACE_BEGIN typedef ccl_addr_space struct MicrofacetExtra { - float3 color; + float3 color, cspec0; + float clearcoat; } MicrofacetExtra; typedef ccl_addr_space struct MicrofacetBsdf { @@ -45,7 +46,6 @@ typedef ccl_addr_space struct MicrofacetBsdf { float alpha_x, alpha_y, ior; MicrofacetExtra *extra; float3 T; - float3 N; } MicrofacetBsdf; /* Beckmann and GGX microfacet importance sampling. */ @@ -233,6 +233,36 @@ ccl_device_forceinline float3 microfacet_sample_stretched( return normalize(make_float3(-slope_x, -slope_y, 1.0f)); } +/* Calculate the reflection color + * + * If fresnel is used, the color is an interpolation of the F0 color and white + * with respect to the fresnel + * + * Else it is simply white + */ +ccl_device_forceinline float3 reflection_color(const MicrofacetBsdf *bsdf, float3 L, float3 H) { + float3 F = make_float3(1.0f, 1.0f, 1.0f); + bool use_fresnel = (bsdf->type == CLOSURE_BSDF_MICROFACET_GGX_FRESNEL_ID + || bsdf->type == CLOSURE_BSDF_MICROFACET_GGX_CLEARCOAT_ID + || bsdf->type == CLOSURE_BSDF_MICROFACET_GGX_ANISO_FRESNEL_ID); + + if(use_fresnel) { + float F0 = fresnel_dielectric_cos(1.0f, bsdf->ior); + + F = interpolate_fresnel_color(L, H, bsdf->ior, F0, bsdf->extra->cspec0); + } + + return F; +} + +ccl_device_forceinline float D_GTR1(float NdotH, float alpha) +{ + if(alpha >= 1.0f) return M_1_PI_F; + float alpha2 = alpha*alpha; + float t = 1.0f + (alpha2 - 1.0f) * NdotH*NdotH; + return (alpha2 - 1.0f) / (M_PI_F * logf(alpha2) * t); +} + /* GGX microfacet with Smith shadow-masking from: * * 
Microfacet Models for Refraction through Rough Surfaces @@ -248,14 +278,52 @@ ccl_device_forceinline float3 microfacet_sample_stretched( ccl_device int bsdf_microfacet_ggx_setup(MicrofacetBsdf *bsdf) { + bsdf->extra = NULL; + bsdf->alpha_x = saturate(bsdf->alpha_x); bsdf->alpha_y = bsdf->alpha_x; - + bsdf->type = CLOSURE_BSDF_MICROFACET_GGX_ID; return SD_BSDF|SD_BSDF_HAS_EVAL; } +ccl_device int bsdf_microfacet_ggx_fresnel_setup(MicrofacetBsdf *bsdf, const ShaderData *sd) +{ + bsdf->extra->cspec0.x = saturate(bsdf->extra->cspec0.x); + bsdf->extra->cspec0.y = saturate(bsdf->extra->cspec0.y); + bsdf->extra->cspec0.z = saturate(bsdf->extra->cspec0.z); + + float F0 = fresnel_dielectric_cos(1.0f, bsdf->ior); + float F = average(interpolate_fresnel_color(sd->I, bsdf->N, bsdf->ior, F0, bsdf->extra->cspec0)); + bsdf->sample_weight *= F; + + bsdf->alpha_x = saturate(bsdf->alpha_x); + bsdf->alpha_y = bsdf->alpha_x; + + bsdf->type = CLOSURE_BSDF_MICROFACET_GGX_FRESNEL_ID; + + return SD_BSDF|SD_BSDF_HAS_EVAL; +} + +ccl_device int bsdf_microfacet_ggx_clearcoat_setup(MicrofacetBsdf *bsdf, const ShaderData *sd) +{ + bsdf->extra->cspec0.x = saturate(bsdf->extra->cspec0.x); + bsdf->extra->cspec0.y = saturate(bsdf->extra->cspec0.y); + bsdf->extra->cspec0.z = saturate(bsdf->extra->cspec0.z); + + float F0 = fresnel_dielectric_cos(1.0f, bsdf->ior); + float F = average(interpolate_fresnel_color(sd->I, bsdf->N, bsdf->ior, F0, bsdf->extra->cspec0)); + bsdf->sample_weight *= 0.25f * bsdf->extra->clearcoat * F; + + bsdf->alpha_x = saturate(bsdf->alpha_x); + bsdf->alpha_y = bsdf->alpha_x; + + bsdf->type = CLOSURE_BSDF_MICROFACET_GGX_CLEARCOAT_ID; + + return SD_BSDF|SD_BSDF_HAS_EVAL; +} + ccl_device bool bsdf_microfacet_merge(const ShaderClosure *a, const ShaderClosure *b) { const MicrofacetBsdf *bsdf_a = (const MicrofacetBsdf*)a; @@ -266,23 +334,45 @@ ccl_device bool bsdf_microfacet_merge(const ShaderClosure *a, const ShaderClosur (bsdf_a->alpha_y == bsdf_b->alpha_y) && 
(isequal_float3(bsdf_a->T, bsdf_b->T)) && (bsdf_a->ior == bsdf_b->ior) && - ((!bsdf_a->extra && !bsdf_b->extra) || + ((bsdf_a->extra == NULL && bsdf_b->extra == NULL) || ((bsdf_a->extra && bsdf_b->extra) && (isequal_float3(bsdf_a->extra->color, bsdf_b->extra->color)))); } ccl_device int bsdf_microfacet_ggx_aniso_setup(MicrofacetBsdf *bsdf) { + bsdf->extra = NULL; + bsdf->alpha_x = saturate(bsdf->alpha_x); bsdf->alpha_y = saturate(bsdf->alpha_y); - + bsdf->type = CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID; return SD_BSDF|SD_BSDF_HAS_EVAL; } +ccl_device int bsdf_microfacet_ggx_aniso_fresnel_setup(MicrofacetBsdf *bsdf, const ShaderData *sd) +{ + bsdf->extra->cspec0.x = saturate(bsdf->extra->cspec0.x); + bsdf->extra->cspec0.y = saturate(bsdf->extra->cspec0.y); + bsdf->extra->cspec0.z = saturate(bsdf->extra->cspec0.z); + + float F0 = fresnel_dielectric_cos(1.0f, bsdf->ior); + float F = average(interpolate_fresnel_color(sd->I, bsdf->N, bsdf->ior, F0, bsdf->extra->cspec0)); + bsdf->sample_weight *= F; + + bsdf->alpha_x = saturate(bsdf->alpha_x); + bsdf->alpha_y = saturate(bsdf->alpha_y); + + bsdf->type = CLOSURE_BSDF_MICROFACET_GGX_ANISO_FRESNEL_ID; + + return SD_BSDF|SD_BSDF_HAS_EVAL; +} + ccl_device int bsdf_microfacet_ggx_refraction_setup(MicrofacetBsdf *bsdf) { + bsdf->extra = NULL; + bsdf->alpha_x = saturate(bsdf->alpha_x); bsdf->alpha_y = bsdf->alpha_x; @@ -319,6 +409,8 @@ ccl_device float3 bsdf_microfacet_ggx_eval_reflect(const ShaderClosure *sc, cons float alpha2 = alpha_x * alpha_y; float D, G1o, G1i; + bool is_principled_clearcoat = (bsdf->type == CLOSURE_BSDF_MICROFACET_GGX_CLEARCOAT_ID); + if(alpha_x == alpha_y) { /* isotropic * eq. 
20: (F*G*D)/(4*in*on) @@ -327,7 +419,18 @@ ccl_device float3 bsdf_microfacet_ggx_eval_reflect(const ShaderClosure *sc, cons float cosThetaM2 = cosThetaM * cosThetaM; float cosThetaM4 = cosThetaM2 * cosThetaM2; float tanThetaM2 = (1 - cosThetaM2) / cosThetaM2; - D = alpha2 / (M_PI_F * cosThetaM4 * (alpha2 + tanThetaM2) * (alpha2 + tanThetaM2)); + + if(is_principled_clearcoat) { + /* use GTR1 for clearcoat */ + D = D_GTR1(cosThetaM, bsdf->alpha_x); + + /* the alpha value for clearcoat is a fixed 0.25 => alpha2 = 0.25 * 0.25 */ + alpha2 = 0.0625f; + } + else { + /* use GTR2 otherwise */ + D = alpha2 / (M_PI_F * cosThetaM4 * (alpha2 + tanThetaM2) * (alpha2 + tanThetaM2)); + } /* eq. 34: now calculate G1(i,m) and G1(o,m) */ G1o = 2 / (1 + safe_sqrtf(1 + alpha2 * (1 - cosNO * cosNO) / (cosNO * cosNO))); @@ -374,7 +477,13 @@ ccl_device float3 bsdf_microfacet_ggx_eval_reflect(const ShaderClosure *sc, cons /* eq. 20 */ float common = D * 0.25f / cosNO; - float out = G * common; + + float3 F = reflection_color(bsdf, omega_in, m); + if(is_principled_clearcoat) { + F *= 0.25f * bsdf->extra->clearcoat; + } + + float3 out = F * G * common; /* eq. 2 in distribution of visible normals sampling * pm = Dw = G1o * dot(m, I) * D / dot(N, I); */ @@ -384,7 +493,7 @@ ccl_device float3 bsdf_microfacet_ggx_eval_reflect(const ShaderClosure *sc, cons * pdf = pm * 0.25 / dot(m, I); */ *pdf = G1o * common; - return make_float3(out, out, out); + return out; } return make_float3(0.0f, 0.0f, 0.0f); @@ -452,6 +561,7 @@ ccl_device int bsdf_microfacet_ggx_sample(KernelGlobals *kg, const ShaderClosure float alpha_y = bsdf->alpha_y; bool m_refractive = bsdf->type == CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID; float3 N = bsdf->N; + int label; float cosNO = dot(N, I); if(cosNO > 0) { @@ -477,6 +587,7 @@ ccl_device int bsdf_microfacet_ggx_sample(KernelGlobals *kg, const ShaderClosure /* reflection or refraction? 
*/ if(!m_refractive) { float cosMO = dot(m, I); + label = LABEL_REFLECT | LABEL_GLOSSY; if(cosMO > 0) { /* eq. 39 - compute actual reflected direction */ @@ -487,6 +598,17 @@ ccl_device int bsdf_microfacet_ggx_sample(KernelGlobals *kg, const ShaderClosure /* some high number for MIS */ *pdf = 1e6f; *eval = make_float3(1e6f, 1e6f, 1e6f); + + bool use_fresnel = (bsdf->type == CLOSURE_BSDF_MICROFACET_GGX_FRESNEL_ID + || bsdf->type == CLOSURE_BSDF_MICROFACET_GGX_CLEARCOAT_ID + || bsdf->type == CLOSURE_BSDF_MICROFACET_GGX_ANISO_FRESNEL_ID); + + /* if fresnel is used, calculate the color with reflection_color(...) */ + if(use_fresnel) { + *eval *= reflection_color(bsdf, *omega_in, m); + } + + label = LABEL_REFLECT | LABEL_SINGULAR; } else { /* microfacet normal is visible to this ray */ @@ -494,16 +616,32 @@ ccl_device int bsdf_microfacet_ggx_sample(KernelGlobals *kg, const ShaderClosure float alpha2 = alpha_x * alpha_y; float D, G1i; + bool is_principled_clearcoat = (bsdf->type == CLOSURE_BSDF_MICROFACET_GGX_CLEARCOAT_ID); + if(alpha_x == alpha_y) { /* isotropic */ float cosThetaM2 = cosThetaM * cosThetaM; float cosThetaM4 = cosThetaM2 * cosThetaM2; float tanThetaM2 = 1/(cosThetaM2) - 1; - D = alpha2 / (M_PI_F * cosThetaM4 * (alpha2 + tanThetaM2) * (alpha2 + tanThetaM2)); /* eval BRDF*cosNI */ float cosNI = dot(N, *omega_in); + if(is_principled_clearcoat) { + /* use GTR1 for clearcoat */ + D = D_GTR1(cosThetaM, bsdf->alpha_x); + + /* the alpha value for clearcoat is a fixed 0.25 => alpha2 = 0.25 * 0.25 */ + alpha2 = 0.0625f; + + /* recalculate G1o */ + G1o = 2 / (1 + safe_sqrtf(1 + alpha2 * (1 - cosNO * cosNO) / (cosNO * cosNO))); + } + else { + /* use GTR2 otherwise */ + D = alpha2 / (M_PI_F * cosThetaM4 * (alpha2 + tanThetaM2) * (alpha2 + tanThetaM2)); + } + /* eq. 
34: now calculate G1(i,m) */ G1i = 2 / (1 + safe_sqrtf(1 + alpha2 * (1 - cosNI * cosNI) / (cosNI * cosNI))); } @@ -535,10 +673,14 @@ ccl_device int bsdf_microfacet_ggx_sample(KernelGlobals *kg, const ShaderClosure /* see eval function for derivation */ float common = (G1o * D) * 0.25f / cosNO; - float out = G1i * common; *pdf = common; - *eval = make_float3(out, out, out); + float3 F = reflection_color(bsdf, *omega_in, m); + if(is_principled_clearcoat) { + F *= 0.25f * bsdf->extra->clearcoat; + } + + *eval = G1i * common * F; } #ifdef __RAY_DIFFERENTIALS__ @@ -549,6 +691,8 @@ ccl_device int bsdf_microfacet_ggx_sample(KernelGlobals *kg, const ShaderClosure } } else { + label = LABEL_TRANSMIT | LABEL_GLOSSY; + /* CAUTION: the i and o variables are inverted relative to the paper * eq. 39 - compute actual refractive direction */ float3 R, T; @@ -576,6 +720,7 @@ ccl_device int bsdf_microfacet_ggx_sample(KernelGlobals *kg, const ShaderClosure /* some high number for MIS */ *pdf = 1e6f; *eval = make_float3(1e6f, 1e6f, 1e6f); + label = LABEL_TRANSMIT | LABEL_SINGULAR; } else { /* eq. 33 */ @@ -607,7 +752,10 @@ ccl_device int bsdf_microfacet_ggx_sample(KernelGlobals *kg, const ShaderClosure } } } - return (m_refractive) ? LABEL_TRANSMIT|LABEL_GLOSSY : LABEL_REFLECT|LABEL_GLOSSY; + else { + label = (m_refractive) ? LABEL_TRANSMIT|LABEL_GLOSSY : LABEL_REFLECT|LABEL_GLOSSY; + } + return label; } /* Beckmann microfacet with Smith shadow-masking from: @@ -815,6 +963,7 @@ ccl_device int bsdf_microfacet_beckmann_sample(KernelGlobals *kg, const ShaderCl float alpha_y = bsdf->alpha_y; bool m_refractive = bsdf->type == CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID; float3 N = bsdf->N; + int label; float cosNO = dot(N, I); if(cosNO > 0) { @@ -839,6 +988,7 @@ ccl_device int bsdf_microfacet_beckmann_sample(KernelGlobals *kg, const ShaderCl /* reflection or refraction? 
*/ if(!m_refractive) { + label = LABEL_REFLECT | LABEL_GLOSSY; float cosMO = dot(m, I); if(cosMO > 0) { @@ -850,6 +1000,7 @@ ccl_device int bsdf_microfacet_beckmann_sample(KernelGlobals *kg, const ShaderCl /* some high number for MIS */ *pdf = 1e6f; *eval = make_float3(1e6f, 1e6f, 1e6f); + label = LABEL_REFLECT | LABEL_SINGULAR; } else { /* microfacet normal is visible to this ray @@ -904,6 +1055,8 @@ ccl_device int bsdf_microfacet_beckmann_sample(KernelGlobals *kg, const ShaderCl } } else { + label = LABEL_TRANSMIT | LABEL_GLOSSY; + /* CAUTION: the i and o variables are inverted relative to the paper * eq. 39 - compute actual refractive direction */ float3 R, T; @@ -931,6 +1084,7 @@ ccl_device int bsdf_microfacet_beckmann_sample(KernelGlobals *kg, const ShaderCl /* some high number for MIS */ *pdf = 1e6f; *eval = make_float3(1e6f, 1e6f, 1e6f); + label = LABEL_TRANSMIT | LABEL_SINGULAR; } else { /* eq. 33 */ @@ -963,7 +1117,10 @@ ccl_device int bsdf_microfacet_beckmann_sample(KernelGlobals *kg, const ShaderCl } } } - return (m_refractive) ? LABEL_TRANSMIT|LABEL_GLOSSY : LABEL_REFLECT|LABEL_GLOSSY; + else { + label = (m_refractive) ? LABEL_TRANSMIT|LABEL_GLOSSY : LABEL_REFLECT|LABEL_GLOSSY; + } + return label; } CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/closure/bsdf_microfacet_multi.h b/intern/cycles/kernel/closure/bsdf_microfacet_multi.h index cea59adfebe..2f2c35d5d1f 100644 --- a/intern/cycles/kernel/closure/bsdf_microfacet_multi.h +++ b/intern/cycles/kernel/closure/bsdf_microfacet_multi.h @@ -40,20 +40,20 @@ ccl_device_forceinline float D_ggx_aniso(const float3 wm, const float2 alpha) } /* Sample slope distribution (based on page 14 of the supplemental implementation). 
*/ -ccl_device_forceinline float2 mf_sampleP22_11(const float cosI, const float2 randU) +ccl_device_forceinline float2 mf_sampleP22_11(const float cosI, const float randx, const float randy) { - if(cosI > 0.9999f || cosI < 1e-6f) { - const float r = sqrtf(randU.x / (1.0f - randU.x)); - const float phi = M_2PI_F * randU.y; + if(cosI > 0.9999f || fabsf(cosI) < 1e-6f) { + const float r = sqrtf(randx / max(1.0f - randx, 1e-7f)); + const float phi = M_2PI_F * randy; return make_float2(r*cosf(phi), r*sinf(phi)); } - const float sinI = sqrtf(1.0f - cosI*cosI); + const float sinI = safe_sqrtf(1.0f - cosI*cosI); const float tanI = sinI/cosI; const float projA = 0.5f * (cosI + 1.0f); if(projA < 0.0001f) return make_float2(0.0f, 0.0f); - const float A = 2.0f*randU.x*projA / cosI - 1.0f; + const float A = 2.0f*randx*projA / cosI - 1.0f; float tmp = A*A-1.0f; if(fabsf(tmp) < 1e-7f) return make_float2(0.0f, 0.0f); @@ -64,26 +64,26 @@ ccl_device_forceinline float2 mf_sampleP22_11(const float cosI, const float2 ran const float slopeX = (A < 0.0f || slopeX2 > 1.0f/tanI)? (tanI*tmp - D) : slopeX2; float U2; - if(randU.y >= 0.5f) - U2 = 2.0f*(randU.y - 0.5f); + if(randy >= 0.5f) + U2 = 2.0f*(randy - 0.5f); else - U2 = 2.0f*(0.5f - randU.y); + U2 = 2.0f*(0.5f - randy); const float z = (U2*(U2*(U2*0.27385f-0.73369f)+0.46341f)) / (U2*(U2*(U2*0.093073f+0.309420f)-1.0f)+0.597999f); const float slopeY = z * sqrtf(1.0f + slopeX*slopeX); - if(randU.y >= 0.5f) + if(randy >= 0.5f) return make_float2(slopeX, slopeY); else return make_float2(slopeX, -slopeY); } /* Visible normal sampling for the GGX distribution (based on page 7 of the supplemental implementation). 
*/ -ccl_device_forceinline float3 mf_sample_vndf(const float3 wi, const float2 alpha, const float2 randU) +ccl_device_forceinline float3 mf_sample_vndf(const float3 wi, const float2 alpha, const float randx, const float randy) { const float3 wi_11 = normalize(make_float3(alpha.x*wi.x, alpha.y*wi.y, wi.z)); - const float2 slope_11 = mf_sampleP22_11(wi_11.z, randU); + const float2 slope_11 = mf_sampleP22_11(wi_11.z, randx, randy); - const float2 cossin_phi = normalize(make_float2(wi_11.x, wi_11.y)); + const float3 cossin_phi = safe_normalize(make_float3(wi_11.x, wi_11.y, 0.0f)); const float slope_x = alpha.x*(cossin_phi.x * slope_11.x - cossin_phi.y * slope_11.y); const float slope_y = alpha.y*(cossin_phi.y * slope_11.x + cossin_phi.x * slope_11.y); @@ -91,18 +91,15 @@ ccl_device_forceinline float3 mf_sample_vndf(const float3 wi, const float2 alpha return normalize(make_float3(-slope_x, -slope_y, 1.0f)); } -/* === Phase functions: Glossy, Diffuse and Glass === */ +/* === Phase functions: Glossy and Glass === */ -/* Phase function for reflective materials, either without a fresnel term (for compatibility) or with the conductive fresnel term. */ -ccl_device_forceinline float3 mf_sample_phase_glossy(const float3 wi, float3 *n, float3 *k, float3 *weight, const float3 wm) +/* Phase function for reflective materials. 
*/ +ccl_device_forceinline float3 mf_sample_phase_glossy(const float3 wi, float3 *weight, const float3 wm) { - if(n && k) - *weight *= fresnel_conductor(dot(wi, wm), *n, *k); - return -wi + 2.0f * wm * dot(wi, wm); } -ccl_device_forceinline float3 mf_eval_phase_glossy(const float3 w, const float lambda, const float3 wo, const float2 alpha, float3 *n, float3 *k) +ccl_device_forceinline float3 mf_eval_phase_glossy(const float3 w, const float lambda, const float3 wo, const float2 alpha) { if(w.z > 0.9999f) return make_float3(0.0f, 0.0f, 0.0f); @@ -123,30 +120,9 @@ ccl_device_forceinline float3 mf_eval_phase_glossy(const float3 w, const float l else phase *= D_ggx_aniso(wh, alpha); - if(n && k) { - /* Apply conductive fresnel term. */ - return phase * fresnel_conductor(dotW_WH, *n, *k); - } - return make_float3(phase, phase, phase); } -/* Phase function for rough lambertian diffuse surfaces. */ -ccl_device_forceinline float3 mf_sample_phase_diffuse(const float3 wm, const float randu, const float randv) -{ - float3 tm, bm; - make_orthonormals(wm, &tm, &bm); - - float2 disk = concentric_sample_disk(randu, randv); - return disk.x*tm + disk.y*bm + safe_sqrtf(1.0f - disk.x*disk.x - disk.y*disk.y)*wm; -} - -ccl_device_forceinline float3 mf_eval_phase_diffuse(const float3 w, const float3 wm) -{ - const float v = max(0.0f, dot(w, wm)) * M_1_PI_F; - return make_float3(v, v, v); -} - /* Phase function for dielectric transmissive materials, including both reflection and refraction according to the dielectric fresnel term. 
*/ ccl_device_forceinline float3 mf_sample_phase_glass(const float3 wi, const float eta, const float3 wm, const float randV, bool *outside) { @@ -269,40 +245,69 @@ ccl_device_forceinline float mf_ggx_albedo(float r) return saturate(albedo); } +ccl_device_inline float mf_ggx_transmission_albedo(float a, float ior) +{ + if(ior < 1.0f) { + ior = 1.0f/ior; + } + a = saturate(a); + ior = clamp(ior, 1.0f, 3.0f); + float I_1 = 0.0476898f*expf(-0.978352f*(ior-0.65657f)*(ior-0.65657f)) - 0.033756f*ior + 0.993261f; + float R_1 = (((0.116991f*a - 0.270369f)*a + 0.0501366f)*a - 0.00411511f)*a + 1.00008f; + float I_2 = (((-2.08704f*ior + 26.3298f)*ior - 127.906f)*ior + 292.958f)*ior - 287.946f + 199.803f/(ior*ior) - 101.668f/(ior*ior*ior); + float R_2 = ((((5.3725f*a -24.9307f)*a + 22.7437f)*a - 3.40751f)*a + 0.0986325f)*a + 0.00493504f; + + return saturate(1.0f + I_2*R_2*0.0019127f - (1.0f - I_1)*(1.0f - R_1)*9.3205f); +} + ccl_device_forceinline float mf_ggx_pdf(const float3 wi, const float3 wo, const float alpha) { float D = D_ggx(normalize(wi+wo), alpha); float lambda = mf_lambda(wi, make_float2(alpha, alpha)); + float singlescatter = 0.25f * D / max((1.0f + lambda) * wi.z, 1e-7f); + + float multiscatter = wo.z * M_1_PI_F; + float albedo = mf_ggx_albedo(alpha); - return 0.25f * D / max((1.0f + lambda) * wi.z, 1e-7f) + (1.0f - albedo) * wo.z; + return albedo*singlescatter + (1.0f - albedo)*multiscatter; } ccl_device_forceinline float mf_ggx_aniso_pdf(const float3 wi, const float3 wo, const float2 alpha) { - return 0.25f * D_ggx_aniso(normalize(wi+wo), alpha) / ((1.0f + mf_lambda(wi, alpha)) * wi.z) + (1.0f - mf_ggx_albedo(sqrtf(alpha.x*alpha.y))) * wo.z; -} + float D = D_ggx_aniso(normalize(wi+wo), alpha); + float lambda = mf_lambda(wi, alpha); + float singlescatter = 0.25f * D / max((1.0f + lambda) * wi.z, 1e-7f); -ccl_device_forceinline float mf_diffuse_pdf(const float3 wo) -{ - return M_1_PI_F * wo.z; + float multiscatter = wo.z * M_1_PI_F; + + float albedo = 
mf_ggx_albedo(sqrtf(alpha.x*alpha.y)); + return albedo*singlescatter + (1.0f - albedo)*multiscatter; } ccl_device_forceinline float mf_glass_pdf(const float3 wi, const float3 wo, const float alpha, const float eta) { - float3 wh; - float fresnel; - if(wi.z*wo.z > 0.0f) { - wh = normalize(wi + wo); - fresnel = fresnel_dielectric_cos(dot(wi, wh), eta); - } - else { - wh = normalize(wi + wo*eta); - fresnel = 1.0f - fresnel_dielectric_cos(dot(wi, wh), eta); - } + bool reflective = (wi.z*wo.z > 0.0f); + + float wh_len; + float3 wh = normalize_len(wi + (reflective? wo : (wo*eta)), &wh_len); if(wh.z < 0.0f) wh = -wh; float3 r_wi = (wi.z < 0.0f)? -wi: wi; - return fresnel * max(0.0f, dot(r_wi, wh)) * D_ggx(wh, alpha) / ((1.0f + mf_lambda(r_wi, make_float2(alpha, alpha))) * r_wi.z) + fabsf(wo.z); + float lambda = mf_lambda(r_wi, make_float2(alpha, alpha)); + float D = D_ggx(wh, alpha); + float fresnel = fresnel_dielectric_cos(dot(r_wi, wh), eta); + + float multiscatter = fabsf(wo.z * M_1_PI_F); + if(reflective) { + float singlescatter = 0.25f * D / max((1.0f + lambda) * r_wi.z, 1e-7f); + float albedo = mf_ggx_albedo(alpha); + return fresnel * (albedo*singlescatter + (1.0f - albedo)*multiscatter); + } + else { + float singlescatter = fabsf(dot(r_wi, wh)*dot(wo, wh) * D * eta*eta / max((1.0f + lambda) * r_wi.z * wh_len*wh_len, 1e-7f)); + float albedo = mf_ggx_transmission_albedo(alpha, eta); + return (1.0f - fresnel) * (albedo*singlescatter + (1.0f - albedo)*multiscatter); + } } /* === Actual random walk implementations, one version of mf_eval and mf_sample per phase function. === */ @@ -313,18 +318,11 @@ ccl_device_forceinline float mf_glass_pdf(const float3 wi, const float3 wo, cons #define MF_PHASE_FUNCTION glass #define MF_MULTI_GLASS -#include "bsdf_microfacet_multi_impl.h" - -/* The diffuse phase function is not implemented as a node yet. 
*/ -#if 0 -#define MF_PHASE_FUNCTION diffuse -#define MF_MULTI_DIFFUSE -#include "bsdf_microfacet_multi_impl.h" -#endif +#include "kernel/closure/bsdf_microfacet_multi_impl.h" #define MF_PHASE_FUNCTION glossy #define MF_MULTI_GLOSSY -#include "bsdf_microfacet_multi_impl.h" +#include "kernel/closure/bsdf_microfacet_multi_impl.h" ccl_device void bsdf_microfacet_multi_ggx_blur(ShaderClosure *sc, float roughness) { @@ -345,8 +343,9 @@ ccl_device int bsdf_microfacet_multi_ggx_common_setup(MicrofacetBsdf *bsdf) bsdf->extra->color.x = saturate(bsdf->extra->color.x); bsdf->extra->color.y = saturate(bsdf->extra->color.y); bsdf->extra->color.z = saturate(bsdf->extra->color.z); - - bsdf->type = CLOSURE_BSDF_MICROFACET_MULTI_GGX_ID; + bsdf->extra->cspec0.x = saturate(bsdf->extra->cspec0.x); + bsdf->extra->cspec0.y = saturate(bsdf->extra->cspec0.y); + bsdf->extra->cspec0.z = saturate(bsdf->extra->cspec0.z); return SD_BSDF|SD_BSDF_HAS_EVAL|SD_BSDF_NEEDS_LCG; } @@ -356,6 +355,22 @@ ccl_device int bsdf_microfacet_multi_ggx_aniso_setup(MicrofacetBsdf *bsdf) if(is_zero(bsdf->T)) bsdf->T = make_float3(1.0f, 0.0f, 0.0f); + bsdf->type = CLOSURE_BSDF_MICROFACET_MULTI_GGX_ID; + + return bsdf_microfacet_multi_ggx_common_setup(bsdf); +} + +ccl_device int bsdf_microfacet_multi_ggx_aniso_fresnel_setup(MicrofacetBsdf *bsdf, const ShaderData *sd) +{ + if(is_zero(bsdf->T)) + bsdf->T = make_float3(1.0f, 0.0f, 0.0f); + + bsdf->type = CLOSURE_BSDF_MICROFACET_MULTI_GGX_FRESNEL_ID; + + float F0 = fresnel_dielectric_cos(1.0f, bsdf->ior); + float F = average(interpolate_fresnel_color(sd->I, bsdf->N, bsdf->ior, F0, bsdf->extra->cspec0)); + bsdf->sample_weight *= F; + return bsdf_microfacet_multi_ggx_common_setup(bsdf); } @@ -363,6 +378,30 @@ ccl_device int bsdf_microfacet_multi_ggx_setup(MicrofacetBsdf *bsdf) { bsdf->alpha_y = bsdf->alpha_x; + bsdf->type = CLOSURE_BSDF_MICROFACET_MULTI_GGX_ID; + + return bsdf_microfacet_multi_ggx_common_setup(bsdf); +} + +ccl_device int 
bsdf_microfacet_multi_ggx_fresnel_setup(MicrofacetBsdf *bsdf, const ShaderData *sd) +{ + bsdf->alpha_y = bsdf->alpha_x; + + bsdf->type = CLOSURE_BSDF_MICROFACET_MULTI_GGX_FRESNEL_ID; + + float F0 = fresnel_dielectric_cos(1.0f, bsdf->ior); + float F = average(interpolate_fresnel_color(sd->I, bsdf->N, bsdf->ior, F0, bsdf->extra->cspec0)); + bsdf->sample_weight *= F; + + return bsdf_microfacet_multi_ggx_common_setup(bsdf); +} + +ccl_device int bsdf_microfacet_multi_ggx_refraction_setup(MicrofacetBsdf *bsdf) +{ + bsdf->alpha_y = bsdf->alpha_x; + + bsdf->type = CLOSURE_BSDF_MICROFACET_MULTI_GGX_ID; + return bsdf_microfacet_multi_ggx_common_setup(bsdf); } @@ -378,6 +417,8 @@ ccl_device float3 bsdf_microfacet_multi_ggx_eval_reflect(const ShaderClosure *sc return make_float3(0.0f, 0.0f, 0.0f); } + bool use_fresnel = (bsdf->type == CLOSURE_BSDF_MICROFACET_MULTI_GGX_FRESNEL_ID); + bool is_aniso = (bsdf->alpha_x != bsdf->alpha_y); float3 X, Y, Z; Z = bsdf->N; @@ -393,7 +434,7 @@ ccl_device float3 bsdf_microfacet_multi_ggx_eval_reflect(const ShaderClosure *sc *pdf = mf_ggx_aniso_pdf(localI, localO, make_float2(bsdf->alpha_x, bsdf->alpha_y)); else *pdf = mf_ggx_pdf(localI, localO, bsdf->alpha_x); - return mf_eval_glossy(localI, localO, true, bsdf->extra->color, bsdf->alpha_x, bsdf->alpha_y, lcg_state, NULL, NULL); + return mf_eval_glossy(localI, localO, true, bsdf->extra->color, bsdf->alpha_x, bsdf->alpha_y, lcg_state, bsdf->ior, use_fresnel, bsdf->extra->cspec0); } ccl_device int bsdf_microfacet_multi_ggx_sample(KernelGlobals *kg, const ShaderClosure *sc, float3 Ng, float3 I, float3 dIdx, float3 dIdy, float randu, float randv, float3 *eval, float3 *omega_in, float3 *domega_in_dx, float3 *domega_in_dy, float *pdf, ccl_addr_space uint *lcg_state) @@ -407,9 +448,15 @@ ccl_device int bsdf_microfacet_multi_ggx_sample(KernelGlobals *kg, const ShaderC *omega_in = 2*dot(Z, I)*Z - I; *pdf = 1e6f; *eval = make_float3(1e6f, 1e6f, 1e6f); +#ifdef __RAY_DIFFERENTIALS__ + *domega_in_dx = (2 
* dot(Z, dIdx)) * Z - dIdx; + *domega_in_dy = (2 * dot(Z, dIdy)) * Z - dIdy; +#endif return LABEL_REFLECT|LABEL_SINGULAR; } + bool use_fresnel = (bsdf->type == CLOSURE_BSDF_MICROFACET_MULTI_GGX_FRESNEL_ID); + bool is_aniso = (bsdf->alpha_x != bsdf->alpha_y); if(is_aniso) make_orthonormals_tangent(Z, bsdf->T, &X, &Y); @@ -419,7 +466,7 @@ ccl_device int bsdf_microfacet_multi_ggx_sample(KernelGlobals *kg, const ShaderC float3 localI = make_float3(dot(I, X), dot(I, Y), dot(I, Z)); float3 localO; - *eval = mf_sample_glossy(localI, &localO, bsdf->extra->color, bsdf->alpha_x, bsdf->alpha_y, lcg_state, NULL, NULL); + *eval = mf_sample_glossy(localI, &localO, bsdf->extra->color, bsdf->alpha_x, bsdf->alpha_y, lcg_state, bsdf->ior, use_fresnel, bsdf->extra->cspec0); if(is_aniso) *pdf = mf_ggx_aniso_pdf(localI, localO, make_float2(bsdf->alpha_x, bsdf->alpha_y)); else @@ -427,6 +474,7 @@ ccl_device int bsdf_microfacet_multi_ggx_sample(KernelGlobals *kg, const ShaderC *eval *= *pdf; *omega_in = X*localO.x + Y*localO.y + Z*localO.z; + #ifdef __RAY_DIFFERENTIALS__ *domega_in_dx = (2 * dot(Z, dIdx)) * Z - dIdx; *domega_in_dy = (2 * dot(Z, dIdy)) * Z - dIdy; @@ -450,6 +498,27 @@ ccl_device int bsdf_microfacet_multi_ggx_glass_setup(MicrofacetBsdf *bsdf) return SD_BSDF|SD_BSDF_HAS_EVAL|SD_BSDF_NEEDS_LCG; } +ccl_device int bsdf_microfacet_multi_ggx_glass_fresnel_setup(MicrofacetBsdf *bsdf, const ShaderData *sd) +{ + bsdf->alpha_x = clamp(bsdf->alpha_x, 1e-4f, 1.0f); + bsdf->alpha_y = bsdf->alpha_x; + bsdf->ior = max(0.0f, bsdf->ior); + bsdf->extra->color.x = saturate(bsdf->extra->color.x); + bsdf->extra->color.y = saturate(bsdf->extra->color.y); + bsdf->extra->color.z = saturate(bsdf->extra->color.z); + bsdf->extra->cspec0.x = saturate(bsdf->extra->cspec0.x); + bsdf->extra->cspec0.y = saturate(bsdf->extra->cspec0.y); + bsdf->extra->cspec0.z = saturate(bsdf->extra->cspec0.z); + + bsdf->type = CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_FRESNEL_ID; + + float F0 = fresnel_dielectric_cos(1.0f, 
bsdf->ior); + float F = average(interpolate_fresnel_color(sd->I, bsdf->N, bsdf->ior, F0, bsdf->extra->cspec0)); + bsdf->sample_weight *= F; + + return SD_BSDF|SD_BSDF_HAS_EVAL|SD_BSDF_NEEDS_LCG; +} + ccl_device float3 bsdf_microfacet_multi_ggx_glass_eval_transmit(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf, ccl_addr_space uint *lcg_state) { const MicrofacetBsdf *bsdf = (const MicrofacetBsdf*)sc; @@ -465,7 +534,7 @@ ccl_device float3 bsdf_microfacet_multi_ggx_glass_eval_transmit(const ShaderClos float3 localO = make_float3(dot(omega_in, X), dot(omega_in, Y), dot(omega_in, Z)); *pdf = mf_glass_pdf(localI, localO, bsdf->alpha_x, bsdf->ior); - return mf_eval_glass(localI, localO, false, bsdf->extra->color, bsdf->alpha_x, bsdf->alpha_y, lcg_state, bsdf->ior); + return mf_eval_glass(localI, localO, false, bsdf->extra->color, bsdf->alpha_x, bsdf->alpha_y, lcg_state, bsdf->ior, false, bsdf->extra->color); } ccl_device float3 bsdf_microfacet_multi_ggx_glass_eval_reflect(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf, ccl_addr_space uint *lcg_state) { @@ -475,6 +544,8 @@ ccl_device float3 bsdf_microfacet_multi_ggx_glass_eval_reflect(const ShaderClosu return make_float3(0.0f, 0.0f, 0.0f); } + bool use_fresnel = (bsdf->type == CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_FRESNEL_ID); + float3 X, Y, Z; Z = bsdf->N; make_orthonormals(Z, &X, &Y); @@ -483,7 +554,7 @@ ccl_device float3 bsdf_microfacet_multi_ggx_glass_eval_reflect(const ShaderClosu float3 localO = make_float3(dot(omega_in, X), dot(omega_in, Y), dot(omega_in, Z)); *pdf = mf_glass_pdf(localI, localO, bsdf->alpha_x, bsdf->ior); - return mf_eval_glass(localI, localO, true, bsdf->extra->color, bsdf->alpha_x, bsdf->alpha_y, lcg_state, bsdf->ior); + return mf_eval_glass(localI, localO, true, bsdf->extra->color, bsdf->alpha_x, bsdf->alpha_y, lcg_state, bsdf->ior, use_fresnel, bsdf->extra->cspec0); } ccl_device int bsdf_microfacet_multi_ggx_glass_sample(KernelGlobals *kg, 
const ShaderClosure *sc, float3 Ng, float3 I, float3 dIdx, float3 dIdy, float randu, float randv, float3 *eval, float3 *omega_in, float3 *domega_in_dx, float3 *domega_in_dy, float *pdf, ccl_addr_space uint *lcg_state) @@ -525,12 +596,14 @@ ccl_device int bsdf_microfacet_multi_ggx_glass_sample(KernelGlobals *kg, const S } } + bool use_fresnel = (bsdf->type == CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_FRESNEL_ID); + make_orthonormals(Z, &X, &Y); float3 localI = make_float3(dot(I, X), dot(I, Y), dot(I, Z)); float3 localO; - *eval = mf_sample_glass(localI, &localO, bsdf->extra->color, bsdf->alpha_x, bsdf->alpha_y, lcg_state, bsdf->ior); + *eval = mf_sample_glass(localI, &localO, bsdf->extra->color, bsdf->alpha_x, bsdf->alpha_y, lcg_state, bsdf->ior, use_fresnel, bsdf->extra->cspec0); *pdf = mf_glass_pdf(localI, localO, bsdf->alpha_x, bsdf->ior); *eval *= *pdf; diff --git a/intern/cycles/kernel/closure/bsdf_microfacet_multi_impl.h b/intern/cycles/kernel/closure/bsdf_microfacet_multi_impl.h index 8054fa8e849..e73915dbda7 100644 --- a/intern/cycles/kernel/closure/bsdf_microfacet_multi_impl.h +++ b/intern/cycles/kernel/closure/bsdf_microfacet_multi_impl.h @@ -26,19 +26,16 @@ * the balance heuristic isn't necessarily optimal anymore. */ ccl_device_forceinline float3 MF_FUNCTION_FULL_NAME(mf_eval)( - float3 wi, - float3 wo, - const bool wo_outside, - const float3 color, - const float alpha_x, - const float alpha_y, - ccl_addr_space uint *lcg_state -#ifdef MF_MULTI_GLASS - , const float eta -#elif defined(MF_MULTI_GLOSSY) - , float3 *n, float3 *k -#endif -) + float3 wi, + float3 wo, + const bool wo_outside, + const float3 color, + const float alpha_x, + const float alpha_y, + ccl_addr_space uint *lcg_state, + const float eta, + bool use_fresnel, + const float3 cspec0) { /* Evaluating for a shallower incoming direction produces less noise, and the properties of the BSDF guarantee reciprocity. 
*/ bool swapped = false; @@ -71,50 +68,57 @@ ccl_device_forceinline float3 MF_FUNCTION_FULL_NAME(mf_eval)( /* Analytically compute single scattering for lower noise. */ float3 eval; + float3 throughput = make_float3(1.0f, 1.0f, 1.0f); + const float3 wh = normalize(wi+wo); #ifdef MF_MULTI_GLASS eval = mf_eval_phase_glass(-wi, lambda_r, wo, wo_outside, alpha, eta); if(wo_outside) eval *= -lambda_r / (shadowing_lambda - lambda_r); else eval *= -lambda_r * beta(-lambda_r, shadowing_lambda+1.0f); -#elif defined(MF_MULTI_DIFFUSE) - /* Diffuse has no special closed form for the single scattering bounce */ - eval = make_float3(0.0f, 0.0f, 0.0f); #else /* MF_MULTI_GLOSSY */ - const float3 wh = normalize(wi+wo); const float G2 = 1.0f / (1.0f - (lambda_r + 1.0f) + shadowing_lambda); float val = G2 * 0.25f / wi.z; if(alpha.x == alpha.y) val *= D_ggx(wh, alpha.x); else val *= D_ggx_aniso(wh, alpha); - if(n && k) { - eval = fresnel_conductor(dot(wh, wi), *n, *k) * val; - } - else { - eval = make_float3(val, val, val); - } + eval = make_float3(val, val, val); #endif + float F0 = fresnel_dielectric_cos(1.0f, eta); + if(use_fresnel) { + throughput = interpolate_fresnel_color(wi, wh, eta, F0, cspec0); + + eval *= throughput; + } + float3 wr = -wi; float hr = 1.0f; float C1_r = 1.0f; float G1_r = 0.0f; bool outside = true; - float3 throughput = make_float3(1.0f, 1.0f, 1.0f); for(int order = 0; order < 10; order++) { - /* Sample microfacet height and normal */ - if(!mf_sample_height(wr, &hr, &C1_r, &G1_r, &lambda_r, lcg_step_float_addrspace(lcg_state))) + /* Sample microfacet height. */ + float height_rand = lcg_step_float_addrspace(lcg_state); + if(!mf_sample_height(wr, &hr, &C1_r, &G1_r, &lambda_r, height_rand)) break; - float3 wm = mf_sample_vndf(-wr, alpha, make_float2(lcg_step_float_addrspace(lcg_state), - lcg_step_float_addrspace(lcg_state))); - -#ifdef MF_MULTI_DIFFUSE - if(order == 0) { - /* Compute single-scattering for diffuse. 
*/ - const float G2_G1 = -lambda_r / (shadowing_lambda - lambda_r); - eval += throughput * G2_G1 * mf_eval_phase_diffuse(wo, wm); + /* Sample microfacet normal. */ + float vndf_rand_y = lcg_step_float_addrspace(lcg_state); + float vndf_rand_x = lcg_step_float_addrspace(lcg_state); + float3 wm = mf_sample_vndf(-wr, alpha, vndf_rand_x, vndf_rand_y); + +#ifdef MF_MULTI_GLASS + if(order == 0 && use_fresnel) { + /* Evaluate amount of scattering towards wo on this microfacet. */ + float3 phase; + if(outside) + phase = mf_eval_phase_glass(wr, lambda_r, wo, wo_outside, alpha, eta); + else + phase = mf_eval_phase_glass(wr, lambda_r, -wo, !wo_outside, alpha, 1.0f / eta); + + eval = throughput * phase * mf_G1(wo_outside ? wo : -wo, mf_C1((outside == wo_outside) ? hr : -hr), shadowing_lambda); } #endif if(order > 0) { @@ -125,10 +129,8 @@ ccl_device_forceinline float3 MF_FUNCTION_FULL_NAME(mf_eval)( phase = mf_eval_phase_glass(wr, lambda_r, wo, wo_outside, alpha, eta); else phase = mf_eval_phase_glass(wr, lambda_r, -wo, !wo_outside, alpha, 1.0f/eta); -#elif defined(MF_MULTI_DIFFUSE) - phase = mf_eval_phase_diffuse(wo, wm); #else /* MF_MULTI_GLOSSY */ - phase = mf_eval_phase_glossy(wr, lambda_r, wo, alpha, n, k) * throughput; + phase = mf_eval_phase_glossy(wr, lambda_r, wo, alpha) * throughput; #endif eval += throughput * phase * mf_G1(wo_outside? wo: -wo, mf_C1((outside == wo_outside)? hr: -hr), shadowing_lambda); } @@ -136,23 +138,32 @@ ccl_device_forceinline float3 MF_FUNCTION_FULL_NAME(mf_eval)( /* Bounce from the microfacet. */ #ifdef MF_MULTI_GLASS bool next_outside; - wr = mf_sample_phase_glass(-wr, outside? eta: 1.0f/eta, wm, lcg_step_float_addrspace(lcg_state), &next_outside); + float3 wi_prev = -wr; + float phase_rand = lcg_step_float_addrspace(lcg_state); + wr = mf_sample_phase_glass(-wr, outside? 
eta: 1.0f/eta, wm, phase_rand, &next_outside); if(!next_outside) { outside = !outside; wr = -wr; hr = -hr; } -#elif defined(MF_MULTI_DIFFUSE) - wr = mf_sample_phase_diffuse(wm, - lcg_step_float_addrspace(lcg_state), - lcg_step_float_addrspace(lcg_state)); + + if(use_fresnel && !next_outside) { + throughput *= color; + } + else if(use_fresnel && order > 0) { + throughput *= interpolate_fresnel_color(wi_prev, wm, eta, F0, cspec0); + } #else /* MF_MULTI_GLOSSY */ - wr = mf_sample_phase_glossy(-wr, n, k, &throughput, wm); + if(use_fresnel && order > 0) { + throughput *= interpolate_fresnel_color(-wr, wm, eta, F0, cspec0); + } + wr = mf_sample_phase_glossy(-wr, &throughput, wm); #endif lambda_r = mf_lambda(wr, alpha); - throughput *= color; + if(!use_fresnel) + throughput *= color; C1_r = mf_C1(hr); G1_r = mf_G1(wr, C1_r, lambda_r); @@ -168,13 +179,16 @@ ccl_device_forceinline float3 MF_FUNCTION_FULL_NAME(mf_eval)( * escaped the surface in wo. The function returns the throughput between wi and wo. * Without reflection losses due to coloring or fresnel absorption in conductors, the sampling is optimal. 
*/ -ccl_device_forceinline float3 MF_FUNCTION_FULL_NAME(mf_sample)(float3 wi, float3 *wo, const float3 color, const float alpha_x, const float alpha_y, ccl_addr_space uint *lcg_state -#ifdef MF_MULTI_GLASS - , const float eta -#elif defined(MF_MULTI_GLOSSY) - , float3 *n, float3 *k -#endif -) +ccl_device_forceinline float3 MF_FUNCTION_FULL_NAME(mf_sample)( + float3 wi, + float3 *wo, + const float3 color, + const float alpha_x, + const float alpha_y, + ccl_addr_space uint *lcg_state, + const float eta, + bool use_fresnel, + const float3 cspec0) { const float2 alpha = make_float2(alpha_x, alpha_y); @@ -186,37 +200,64 @@ ccl_device_forceinline float3 MF_FUNCTION_FULL_NAME(mf_sample)(float3 wi, float3 float G1_r = 0.0f; bool outside = true; + float F0 = fresnel_dielectric_cos(1.0f, eta); + if(use_fresnel) { + throughput = interpolate_fresnel_color(wi, normalize(wi + wr), eta, F0, cspec0); + } + int order; for(order = 0; order < 10; order++) { /* Sample microfacet height. */ - if(!mf_sample_height(wr, &hr, &C1_r, &G1_r, &lambda_r, lcg_step_float_addrspace(lcg_state))) { + float height_rand = lcg_step_float_addrspace(lcg_state); + if(!mf_sample_height(wr, &hr, &C1_r, &G1_r, &lambda_r, height_rand)) { /* The random walk has left the surface. */ *wo = outside? wr: -wr; return throughput; } /* Sample microfacet normal. */ - float3 wm = mf_sample_vndf(-wr, alpha, make_float2(lcg_step_float_addrspace(lcg_state), - lcg_step_float_addrspace(lcg_state))); + float vndf_rand_y = lcg_step_float_addrspace(lcg_state); + float vndf_rand_x = lcg_step_float_addrspace(lcg_state); + float3 wm = mf_sample_vndf(-wr, alpha, vndf_rand_x, vndf_rand_y); /* First-bounce color is already accounted for in mix weight. */ - if(order > 0) + if(!use_fresnel && order > 0) throughput *= color; /* Bounce from the microfacet. */ #ifdef MF_MULTI_GLASS bool next_outside; - wr = mf_sample_phase_glass(-wr, outside? 
eta: 1.0f/eta, wm, lcg_step_float_addrspace(lcg_state), &next_outside); + float3 wi_prev = -wr; + float phase_rand = lcg_step_float_addrspace(lcg_state); + wr = mf_sample_phase_glass(-wr, outside? eta: 1.0f/eta, wm, phase_rand, &next_outside); if(!next_outside) { hr = -hr; wr = -wr; outside = !outside; } -#elif defined(MF_MULTI_DIFFUSE) - wr = mf_sample_phase_diffuse(wm, - lcg_step_float_addrspace(lcg_state), - lcg_step_float_addrspace(lcg_state)); + + if(use_fresnel) { + if(!next_outside) { + throughput *= color; + } + else { + float3 t_color = interpolate_fresnel_color(wi_prev, wm, eta, F0, cspec0); + + if(order == 0) + throughput = t_color; + else + throughput *= t_color; + } + } #else /* MF_MULTI_GLOSSY */ - wr = mf_sample_phase_glossy(-wr, n, k, &throughput, wm); + if(use_fresnel) { + float3 t_color = interpolate_fresnel_color(-wr, wm, eta, F0, cspec0); + + if(order == 0) + throughput = t_color; + else + throughput *= t_color; + } + wr = mf_sample_phase_glossy(-wr, &throughput, wm); #endif /* Update random walk parameters. 
*/ @@ -228,6 +269,5 @@ ccl_device_forceinline float3 MF_FUNCTION_FULL_NAME(mf_sample)(float3 wi, float3 } #undef MF_MULTI_GLASS -#undef MF_MULTI_DIFFUSE #undef MF_MULTI_GLOSSY #undef MF_PHASE_FUNCTION diff --git a/intern/cycles/kernel/closure/bsdf_oren_nayar.h b/intern/cycles/kernel/closure/bsdf_oren_nayar.h index cb342a026ef..6b770fc0c16 100644 --- a/intern/cycles/kernel/closure/bsdf_oren_nayar.h +++ b/intern/cycles/kernel/closure/bsdf_oren_nayar.h @@ -22,7 +22,6 @@ CCL_NAMESPACE_BEGIN typedef ccl_addr_space struct OrenNayarBsdf { SHADER_CLOSURE_BASE; - float3 N; float roughness; float a; float b; diff --git a/intern/cycles/kernel/closure/bsdf_phong_ramp.h b/intern/cycles/kernel/closure/bsdf_phong_ramp.h index e152a8780db..420f94755ee 100644 --- a/intern/cycles/kernel/closure/bsdf_phong_ramp.h +++ b/intern/cycles/kernel/closure/bsdf_phong_ramp.h @@ -40,7 +40,6 @@ CCL_NAMESPACE_BEGIN typedef ccl_addr_space struct PhongRampBsdf { SHADER_CLOSURE_BASE; - float3 N; float exponent; float3 *colors; } PhongRampBsdf; diff --git a/intern/cycles/kernel/closure/bsdf_principled_diffuse.h b/intern/cycles/kernel/closure/bsdf_principled_diffuse.h new file mode 100644 index 00000000000..f8ca64293b0 --- /dev/null +++ b/intern/cycles/kernel/closure/bsdf_principled_diffuse.h @@ -0,0 +1,127 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __BSDF_PRINCIPLED_DIFFUSE_H__ +#define __BSDF_PRINCIPLED_DIFFUSE_H__ + +/* DISNEY PRINCIPLED DIFFUSE BRDF + * + * Shading model by Brent Burley (Disney): "Physically Based Shading at Disney" (2012) + */ + +CCL_NAMESPACE_BEGIN + +typedef ccl_addr_space struct PrincipledDiffuseBsdf { + SHADER_CLOSURE_BASE; + + float roughness; +} PrincipledDiffuseBsdf; + +ccl_device float3 calculate_principled_diffuse_brdf(const PrincipledDiffuseBsdf *bsdf, + float3 N, float3 V, float3 L, float3 H, float *pdf) +{ + float NdotL = max(dot(N, L), 0.0f); + float NdotV = max(dot(N, V), 0.0f); + + if(NdotL < 0 || NdotV < 0) { + *pdf = 0.0f; + return make_float3(0.0f, 0.0f, 0.0f); + } + + float LdotH = dot(L, H); + + float FL = schlick_fresnel(NdotL), FV = schlick_fresnel(NdotV); + const float Fd90 = 0.5f + 2.0f * LdotH*LdotH * bsdf->roughness; + float Fd = (1.0f * (1.0f - FL) + Fd90 * FL) * (1.0f * (1.0f - FV) + Fd90 * FV); + + float value = M_1_PI_F * NdotL * Fd; + + return make_float3(value, value, value); +} + +ccl_device int bsdf_principled_diffuse_setup(PrincipledDiffuseBsdf *bsdf) +{ + bsdf->type = CLOSURE_BSDF_PRINCIPLED_DIFFUSE_ID; + return SD_BSDF|SD_BSDF_HAS_EVAL; +} + +ccl_device bool bsdf_principled_diffuse_merge(const ShaderClosure *a, const ShaderClosure *b) +{ + const PrincipledDiffuseBsdf *bsdf_a = (const PrincipledDiffuseBsdf*)a; + const PrincipledDiffuseBsdf *bsdf_b = (const PrincipledDiffuseBsdf*)b; + + return (isequal_float3(bsdf_a->N, bsdf_b->N) && bsdf_a->roughness == bsdf_b->roughness); +} + +ccl_device float3 bsdf_principled_diffuse_eval_reflect(const ShaderClosure *sc, const float3 I, + const float3 omega_in, float *pdf) +{ + const PrincipledDiffuseBsdf *bsdf = (const PrincipledDiffuseBsdf *)sc; + + float3 N = bsdf->N; + float3 V = I; // outgoing + float3 L = omega_in; // incoming + float3 H = normalize(L + V); + + if(dot(N, omega_in) > 0.0f) { + *pdf = fmaxf(dot(N, omega_in), 0.0f) * M_1_PI_F; + return calculate_principled_diffuse_brdf(bsdf, N, V, 
L, H, pdf); + } + else { + *pdf = 0.0f; + return make_float3(0.0f, 0.0f, 0.0f); + } +} + +ccl_device float3 bsdf_principled_diffuse_eval_transmit(const ShaderClosure *sc, const float3 I, + const float3 omega_in, float *pdf) +{ + return make_float3(0.0f, 0.0f, 0.0f); +} + +ccl_device int bsdf_principled_diffuse_sample(const ShaderClosure *sc, + float3 Ng, float3 I, float3 dIdx, float3 dIdy, float randu, float randv, + float3 *eval, float3 *omega_in, float3 *domega_in_dx, + float3 *domega_in_dy, float *pdf) +{ + const PrincipledDiffuseBsdf *bsdf = (const PrincipledDiffuseBsdf *)sc; + + float3 N = bsdf->N; + + sample_cos_hemisphere(N, randu, randv, omega_in, pdf); + + if(dot(Ng, *omega_in) > 0) { + float3 H = normalize(I + *omega_in); + + *eval = calculate_principled_diffuse_brdf(bsdf, N, I, *omega_in, H, pdf); + +#ifdef __RAY_DIFFERENTIALS__ + // TODO: find a better approximation for the diffuse bounce + *domega_in_dx = -((2 * dot(N, dIdx)) * N - dIdx); + *domega_in_dy = -((2 * dot(N, dIdy)) * N - dIdy); +#endif + } + else { + *pdf = 0.0f; + } + return LABEL_REFLECT|LABEL_DIFFUSE; +} + +CCL_NAMESPACE_END + +#endif /* __BSDF_PRINCIPLED_DIFFUSE_H__ */ + + diff --git a/intern/cycles/kernel/closure/bsdf_principled_sheen.h b/intern/cycles/kernel/closure/bsdf_principled_sheen.h new file mode 100644 index 00000000000..f4476bfecd0 --- /dev/null +++ b/intern/cycles/kernel/closure/bsdf_principled_sheen.h @@ -0,0 +1,113 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __BSDF_PRINCIPLED_SHEEN_H__ +#define __BSDF_PRINCIPLED_SHEEN_H__ + +/* DISNEY PRINCIPLED SHEEN BRDF + * + * Shading model by Brent Burley (Disney): "Physically Based Shading at Disney" (2012) + */ + +CCL_NAMESPACE_BEGIN + +typedef ccl_addr_space struct PrincipledSheenBsdf { + SHADER_CLOSURE_BASE; +} PrincipledSheenBsdf; + +ccl_device float3 calculate_principled_sheen_brdf(const PrincipledSheenBsdf *bsdf, + float3 N, float3 V, float3 L, float3 H, float *pdf) +{ + float NdotL = dot(N, L); + float NdotV = dot(N, V); + + if(NdotL < 0 || NdotV < 0) { + *pdf = 0.0f; + return make_float3(0.0f, 0.0f, 0.0f); + } + + float LdotH = dot(L, H); + + float value = schlick_fresnel(LdotH) * NdotL; + + return make_float3(value, value, value); +} + +ccl_device int bsdf_principled_sheen_setup(PrincipledSheenBsdf *bsdf) +{ + bsdf->type = CLOSURE_BSDF_PRINCIPLED_SHEEN_ID; + return SD_BSDF|SD_BSDF_HAS_EVAL; +} + +ccl_device float3 bsdf_principled_sheen_eval_reflect(const ShaderClosure *sc, const float3 I, + const float3 omega_in, float *pdf) +{ + const PrincipledSheenBsdf *bsdf = (const PrincipledSheenBsdf *)sc; + + float3 N = bsdf->N; + float3 V = I; // outgoing + float3 L = omega_in; // incoming + float3 H = normalize(L + V); + + if(dot(N, omega_in) > 0.0f) { + *pdf = fmaxf(dot(N, omega_in), 0.0f) * M_1_PI_F; + return calculate_principled_sheen_brdf(bsdf, N, V, L, H, pdf); + } + else { + *pdf = 0.0f; + return make_float3(0.0f, 0.0f, 0.0f); + } +} + +ccl_device float3 bsdf_principled_sheen_eval_transmit(const ShaderClosure *sc, const float3 I, + const float3 omega_in, float *pdf) +{ + return make_float3(0.0f, 0.0f, 0.0f); +} + +ccl_device int bsdf_principled_sheen_sample(const ShaderClosure *sc, + float3 Ng, float3 I, float3 dIdx, float3 dIdy, float randu, float randv, + float3 *eval, float3 *omega_in, float3 *domega_in_dx, + float3 *domega_in_dy, float *pdf) +{ + 
const PrincipledSheenBsdf *bsdf = (const PrincipledSheenBsdf *)sc; + + float3 N = bsdf->N; + + sample_cos_hemisphere(N, randu, randv, omega_in, pdf); + + if(dot(Ng, *omega_in) > 0) { + float3 H = normalize(I + *omega_in); + + *eval = calculate_principled_sheen_brdf(bsdf, N, I, *omega_in, H, pdf); + +#ifdef __RAY_DIFFERENTIALS__ + // TODO: find a better approximation for the diffuse bounce + *domega_in_dx = -((2 * dot(N, dIdx)) * N - dIdx); + *domega_in_dy = -((2 * dot(N, dIdy)) * N - dIdy); +#endif + } + else { + *pdf = 0.0f; + } + return LABEL_REFLECT|LABEL_DIFFUSE; +} + +CCL_NAMESPACE_END + +#endif /* __BSDF_PRINCIPLED_SHEEN_H__ */ + + diff --git a/intern/cycles/kernel/closure/bsdf_toon.h b/intern/cycles/kernel/closure/bsdf_toon.h index 28e775bcbc8..d8b6d8ddead 100644 --- a/intern/cycles/kernel/closure/bsdf_toon.h +++ b/intern/cycles/kernel/closure/bsdf_toon.h @@ -38,7 +38,6 @@ CCL_NAMESPACE_BEGIN typedef ccl_addr_space struct ToonBsdf { SHADER_CLOSURE_BASE; - float3 N; float size; float smooth; } ToonBsdf; diff --git a/intern/cycles/kernel/closure/bsdf_util.h b/intern/cycles/kernel/closure/bsdf_util.h index b0c5280b6cb..3dc15d5791c 100644 --- a/intern/cycles/kernel/closure/bsdf_util.h +++ b/intern/cycles/kernel/closure/bsdf_util.h @@ -124,6 +124,13 @@ ccl_device float3 fresnel_conductor(float cosi, const float3 eta, const float3 k return(Rparl2 + Rperp2) * 0.5f; } +ccl_device float schlick_fresnel(float u) +{ + float m = clamp(1.0f - u, 0.0f, 1.0f); + float m2 = m * m; + return m2 * m2 * m; // pow(m, 5) +} + ccl_device float smooth_step(float edge0, float edge1, float x) { float result; @@ -136,6 +143,19 @@ ccl_device float smooth_step(float edge0, float edge1, float x) return result; } +/* Calculate the fresnel color which is a blend between white and the F0 color (cspec0) */ +ccl_device_forceinline float3 interpolate_fresnel_color(float3 L, float3 H, float ior, float F0, float3 cspec0) { + /* Calculate the fresnel interpolation factor + * The value from 
fresnel_dielectric_cos(...) has to be normalized because + * the cspec0 keeps the F0 color + */ + float F0_norm = 1.0f / (1.0f - F0); + float FH = (fresnel_dielectric_cos(dot(L, H), ior) - F0) * F0_norm; + + /* Blend between white and a specular color with respect to the fresnel */ + return cspec0 * (1.0f - FH) + make_float3(1.0f, 1.0f, 1.0f) * FH; +} + CCL_NAMESPACE_END #endif /* __BSDF_UTIL_H__ */ diff --git a/intern/cycles/kernel/closure/bssrdf.h b/intern/cycles/kernel/closure/bssrdf.h index af0bbd861a9..267aeea6e86 100644 --- a/intern/cycles/kernel/closure/bssrdf.h +++ b/intern/cycles/kernel/closure/bssrdf.h @@ -27,7 +27,7 @@ typedef ccl_addr_space struct Bssrdf { float d; float texture_blur; float albedo; - float3 N; + float roughness; } Bssrdf; /* Planar Truncated Gaussian @@ -348,8 +348,9 @@ ccl_device_inline Bssrdf *bssrdf_alloc(ShaderData *sd, float3 weight) { Bssrdf *bssrdf = (Bssrdf*)closure_alloc(sd, sizeof(Bssrdf), CLOSURE_NONE_ID, weight); - if(!bssrdf) + if(bssrdf == NULL) { return NULL; + } float sample_weight = fabsf(average(weight)); bssrdf->sample_weight = sample_weight; @@ -360,10 +361,32 @@ ccl_device int bssrdf_setup(Bssrdf *bssrdf, ClosureType type) { if(bssrdf->radius < BSSRDF_MIN_RADIUS) { /* revert to diffuse BSDF if radius too small */ - DiffuseBsdf *bsdf = (DiffuseBsdf*)bssrdf; - bsdf->N = bssrdf->N; - int flag = bsdf_diffuse_setup(bsdf); - bsdf->type = CLOSURE_BSDF_BSSRDF_ID; + int flag; +#ifdef __PRINCIPLED__ + if(type == CLOSURE_BSSRDF_PRINCIPLED_ID) { + float roughness = bssrdf->roughness; + float3 N = bssrdf->N; + float3 weight = bssrdf->weight; + float sample_weight = bssrdf->sample_weight; + + PrincipledDiffuseBsdf *bsdf = (PrincipledDiffuseBsdf*)bssrdf; + + bsdf->N = N; + bsdf->roughness = roughness; + bsdf->weight = weight; + bsdf->sample_weight = sample_weight; + flag = bsdf_principled_diffuse_setup(bsdf); + bsdf->type = CLOSURE_BSDF_BSSRDF_PRINCIPLED_ID; + } + else +#endif /* __PRINCIPLED__ */ + { + DiffuseBsdf *bsdf = 
(DiffuseBsdf*)bssrdf; + bsdf->N = bssrdf->N; + flag = bsdf_diffuse_setup(bsdf); + bsdf->type = CLOSURE_BSDF_BSSRDF_ID; + } + return flag; } else { @@ -371,11 +394,13 @@ ccl_device int bssrdf_setup(Bssrdf *bssrdf, ClosureType type) bssrdf->sharpness = saturate(bssrdf->sharpness); bssrdf->type = type; - if(type == CLOSURE_BSSRDF_BURLEY_ID) { + if(type == CLOSURE_BSSRDF_BURLEY_ID || + type == CLOSURE_BSSRDF_PRINCIPLED_ID) + { bssrdf_burley_setup(bssrdf); } - return SD_BSDF|SD_BSDF_HAS_EVAL|SD_BSSRDF; + return SD_BSSRDF; } } @@ -385,7 +410,7 @@ ccl_device void bssrdf_sample(const ShaderClosure *sc, float xi, float *r, float bssrdf_cubic_sample(sc, xi, r, h); else if(sc->type == CLOSURE_BSSRDF_GAUSSIAN_ID) bssrdf_gaussian_sample(sc, xi, r, h); - else /*if(sc->type == CLOSURE_BSSRDF_BURLEY_ID)*/ + else /*if(sc->type == CLOSURE_BSSRDF_BURLEY_ID || sc->type == CLOSURE_BSSRDF_PRINCIPLED_ID)*/ bssrdf_burley_sample(sc, xi, r, h); } @@ -395,7 +420,7 @@ ccl_device_forceinline float bssrdf_pdf(const ShaderClosure *sc, float r) return bssrdf_cubic_pdf(sc, r); else if(sc->type == CLOSURE_BSSRDF_GAUSSIAN_ID) return bssrdf_gaussian_pdf(sc, r); - else /*if(sc->type == CLOSURE_BSSRDF_BURLEY_ID)*/ + else /*if(sc->type == CLOSURE_BSSRDF_BURLEY_ID || sc->type == CLOSURE_BSSRDF_PRINCIPLED_ID)*/ return bssrdf_burley_pdf(sc, r); } diff --git a/intern/cycles/kernel/filter/filter.h b/intern/cycles/kernel/filter/filter.h new file mode 100644 index 00000000000..f6e474d6702 --- /dev/null +++ b/intern/cycles/kernel/filter/filter.h @@ -0,0 +1,52 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __FILTER_H__ +#define __FILTER_H__ + +/* CPU Filter Kernel Interface */ + +#include "util/util_types.h" + +#include "kernel/filter/filter_defines.h" + +CCL_NAMESPACE_BEGIN + +#define KERNEL_NAME_JOIN(x, y, z) x ## _ ## y ## _ ## z +#define KERNEL_NAME_EVAL(arch, name) KERNEL_NAME_JOIN(kernel, arch, name) +#define KERNEL_FUNCTION_FULL_NAME(name) KERNEL_NAME_EVAL(KERNEL_ARCH, name) + +#define KERNEL_ARCH cpu +#include "kernel/kernels/cpu/filter_cpu.h" + +#define KERNEL_ARCH cpu_sse2 +#include "kernel/kernels/cpu/filter_cpu.h" + +#define KERNEL_ARCH cpu_sse3 +#include "kernel/kernels/cpu/filter_cpu.h" + +#define KERNEL_ARCH cpu_sse41 +#include "kernel/kernels/cpu/filter_cpu.h" + +#define KERNEL_ARCH cpu_avx +#include "kernel/kernels/cpu/filter_cpu.h" + +#define KERNEL_ARCH cpu_avx2 +#include "kernel/kernels/cpu/filter_cpu.h" + +CCL_NAMESPACE_END + +#endif /* __FILTER_H__ */ diff --git a/intern/cycles/kernel/filter/filter_defines.h b/intern/cycles/kernel/filter/filter_defines.h new file mode 100644 index 00000000000..ce96f733aff --- /dev/null +++ b/intern/cycles/kernel/filter/filter_defines.h @@ -0,0 +1,38 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __FILTER_DEFINES_H__ +#define __FILTER_DEFINES_H__ + +#define DENOISE_FEATURES 10 +#define TRANSFORM_SIZE (DENOISE_FEATURES*DENOISE_FEATURES) +#define XTWX_SIZE (((DENOISE_FEATURES+1)*(DENOISE_FEATURES+2))/2) +#define XTWY_SIZE (DENOISE_FEATURES+1) + +typedef struct TilesInfo { + int offsets[9]; + int strides[9]; + int x[4]; + int y[4]; + /* TODO(lukas): CUDA doesn't have uint64_t... */ +#ifdef __KERNEL_OPENCL__ + ccl_global float *buffers[9]; +#else + long long int buffers[9]; +#endif +} TilesInfo; + +#endif /* __FILTER_DEFINES_H__*/ diff --git a/intern/cycles/kernel/filter/filter_features.h b/intern/cycles/kernel/filter/filter_features.h new file mode 100644 index 00000000000..6226ed2c2ef --- /dev/null +++ b/intern/cycles/kernel/filter/filter_features.h @@ -0,0 +1,124 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + + CCL_NAMESPACE_BEGIN + +#define ccl_get_feature(buffer, pass) (buffer)[(pass)*pass_stride] + +/* Loop over the pixels in the range [low.x, high.x) x [low.y, high.y). + * pixel_buffer always points to the current pixel in the first pass. */ +#define FOR_PIXEL_WINDOW pixel_buffer = buffer + (low.y - rect.y)*buffer_w + (low.x - rect.x); \ + for(pixel.y = low.y; pixel.y < high.y; pixel.y++) { \ + for(pixel.x = low.x; pixel.x < high.x; pixel.x++, pixel_buffer++) { + +#define END_FOR_PIXEL_WINDOW } \ + pixel_buffer += buffer_w - (high.x - low.x); \ + } + +ccl_device_inline void filter_get_features(int2 pixel, + const ccl_global float *ccl_restrict buffer, + float *features, + const float *ccl_restrict mean, + int pass_stride) +{ + features[0] = pixel.x; + features[1] = pixel.y; + features[2] = fabsf(ccl_get_feature(buffer, 0)); + features[3] = ccl_get_feature(buffer, 1); + features[4] = ccl_get_feature(buffer, 2); + features[5] = ccl_get_feature(buffer, 3); + features[6] = ccl_get_feature(buffer, 4); + features[7] = ccl_get_feature(buffer, 5); + features[8] = ccl_get_feature(buffer, 6); + features[9] = ccl_get_feature(buffer, 7); + if(mean) { + for(int i = 0; i < DENOISE_FEATURES; i++) + features[i] -= mean[i]; + } +} + +ccl_device_inline void filter_get_feature_scales(int2 pixel, + const ccl_global float *ccl_restrict buffer, + float *scales, + const float *ccl_restrict mean, + int pass_stride) +{ + scales[0] = fabsf(pixel.x - mean[0]); + scales[1] = fabsf(pixel.y - mean[1]); + scales[2] = fabsf(fabsf(ccl_get_feature(buffer, 0)) - mean[2]); + scales[3] = len_squared(make_float3(ccl_get_feature(buffer, 1) - mean[3], + ccl_get_feature(buffer, 2) - mean[4], + ccl_get_feature(buffer, 3) - mean[5])); + scales[4] = fabsf(ccl_get_feature(buffer, 4) - mean[6]); + scales[5] = len_squared(make_float3(ccl_get_feature(buffer, 5) - mean[7], + ccl_get_feature(buffer, 6) - mean[8], + ccl_get_feature(buffer, 7) - mean[9])); +} + +ccl_device_inline void 
filter_calculate_scale(float *scale) +{ + scale[0] = 1.0f/max(scale[0], 0.01f); + scale[1] = 1.0f/max(scale[1], 0.01f); + scale[2] = 1.0f/max(scale[2], 0.01f); + scale[6] = 1.0f/max(scale[4], 0.01f); + scale[7] = scale[8] = scale[9] = 1.0f/max(sqrtf(scale[5]), 0.01f); + scale[3] = scale[4] = scale[5] = 1.0f/max(sqrtf(scale[3]), 0.01f); +} + +ccl_device_inline float3 filter_get_color(const ccl_global float *ccl_restrict buffer, + int pass_stride) +{ + return make_float3(ccl_get_feature(buffer, 8), ccl_get_feature(buffer, 9), ccl_get_feature(buffer, 10)); +} + +ccl_device_inline void design_row_add(float *design_row, + int rank, + const ccl_global float *ccl_restrict transform, + int stride, + int row, + float feature) +{ + for(int i = 0; i < rank; i++) { + design_row[1+i] += transform[(row*DENOISE_FEATURES + i)*stride]*feature; + } +} + +/* Fill the design row. */ +ccl_device_inline void filter_get_design_row_transform(int2 p_pixel, + const ccl_global float *ccl_restrict p_buffer, + int2 q_pixel, + const ccl_global float *ccl_restrict q_buffer, + int pass_stride, + int rank, + float *design_row, + const ccl_global float *ccl_restrict transform, + int stride) +{ + design_row[0] = 1.0f; + math_vector_zero(design_row+1, rank); + design_row_add(design_row, rank, transform, stride, 0, q_pixel.x - p_pixel.x); + design_row_add(design_row, rank, transform, stride, 1, q_pixel.y - p_pixel.y); + design_row_add(design_row, rank, transform, stride, 2, fabsf(ccl_get_feature(q_buffer, 0)) - fabsf(ccl_get_feature(p_buffer, 0))); + design_row_add(design_row, rank, transform, stride, 3, ccl_get_feature(q_buffer, 1) - ccl_get_feature(p_buffer, 1)); + design_row_add(design_row, rank, transform, stride, 4, ccl_get_feature(q_buffer, 2) - ccl_get_feature(p_buffer, 2)); + design_row_add(design_row, rank, transform, stride, 5, ccl_get_feature(q_buffer, 3) - ccl_get_feature(p_buffer, 3)); + design_row_add(design_row, rank, transform, stride, 6, ccl_get_feature(q_buffer, 4) - 
ccl_get_feature(p_buffer, 4)); + design_row_add(design_row, rank, transform, stride, 7, ccl_get_feature(q_buffer, 5) - ccl_get_feature(p_buffer, 5)); + design_row_add(design_row, rank, transform, stride, 8, ccl_get_feature(q_buffer, 6) - ccl_get_feature(p_buffer, 6)); + design_row_add(design_row, rank, transform, stride, 9, ccl_get_feature(q_buffer, 7) - ccl_get_feature(p_buffer, 7)); +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/filter/filter_features_sse.h b/intern/cycles/kernel/filter/filter_features_sse.h new file mode 100644 index 00000000000..3ddd8712266 --- /dev/null +++ b/intern/cycles/kernel/filter/filter_features_sse.h @@ -0,0 +1,93 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +CCL_NAMESPACE_BEGIN + +#define ccl_get_feature_sse(pass) load_float4(buffer + (pass)*pass_stride) + +/* Loop over the pixels in the range [low.x, high.x) x [low.y, high.y), 4 at a time. + * pixel_buffer always points to the first of the 4 current pixel in the first pass. + * x4 and y4 contain the coordinates of the four pixels, active_pixels contains a mask that's set for all pixels within the window. 
*/ + +#define FOR_PIXEL_WINDOW_SSE pixel_buffer = buffer + (low.y - rect.y)*buffer_w + (low.x - rect.x); \ + for(pixel.y = low.y; pixel.y < high.y; pixel.y++) { \ + float4 y4 = make_float4(pixel.y); \ + for(pixel.x = low.x; pixel.x < high.x; pixel.x += 4, pixel_buffer += 4) { \ + float4 x4 = make_float4(pixel.x) + make_float4(0.0f, 1.0f, 2.0f, 3.0f); \ + int4 active_pixels = x4 < make_float4(high.x); + +#define END_FOR_PIXEL_WINDOW_SSE } \ + pixel_buffer += buffer_w - (pixel.x - low.x); \ + } + +ccl_device_inline void filter_get_features_sse(float4 x, float4 y, + int4 active_pixels, + const float *ccl_restrict buffer, + float4 *features, + const float4 *ccl_restrict mean, + int pass_stride) +{ + features[0] = x; + features[1] = y; + features[2] = fabs(ccl_get_feature_sse(0)); + features[3] = ccl_get_feature_sse(1); + features[4] = ccl_get_feature_sse(2); + features[5] = ccl_get_feature_sse(3); + features[6] = ccl_get_feature_sse(4); + features[7] = ccl_get_feature_sse(5); + features[8] = ccl_get_feature_sse(6); + features[9] = ccl_get_feature_sse(7); + if(mean) { + for(int i = 0; i < DENOISE_FEATURES; i++) + features[i] = features[i] - mean[i]; + } + for(int i = 0; i < DENOISE_FEATURES; i++) + features[i] = mask(active_pixels, features[i]); +} + +ccl_device_inline void filter_get_feature_scales_sse(float4 x, float4 y, + int4 active_pixels, + const float *ccl_restrict buffer, + float4 *scales, + const float4 *ccl_restrict mean, + int pass_stride) +{ + scales[0] = fabs(x - mean[0]); + scales[1] = fabs(y - mean[1]); + scales[2] = fabs(fabs(ccl_get_feature_sse(0)) - mean[2]); + scales[3] = sqr(ccl_get_feature_sse(1) - mean[3]) + + sqr(ccl_get_feature_sse(2) - mean[4]) + + sqr(ccl_get_feature_sse(3) - mean[5]); + scales[4] = fabs(ccl_get_feature_sse(4) - mean[6]); + scales[5] = sqr(ccl_get_feature_sse(5) - mean[7]) + + sqr(ccl_get_feature_sse(6) - mean[8]) + + sqr(ccl_get_feature_sse(7) - mean[9]); + for(int i = 0; i < 6; i++) + scales[i] = mask(active_pixels, 
scales[i]); +} + +ccl_device_inline void filter_calculate_scale_sse(float4 *scale) +{ + scale[0] = rcp(max(reduce_max(scale[0]), make_float4(0.01f))); + scale[1] = rcp(max(reduce_max(scale[1]), make_float4(0.01f))); + scale[2] = rcp(max(reduce_max(scale[2]), make_float4(0.01f))); + scale[6] = rcp(max(reduce_max(scale[4]), make_float4(0.01f))); + scale[7] = scale[8] = scale[9] = rcp(max(reduce_max(sqrt(scale[5])), make_float4(0.01f))); + scale[3] = scale[4] = scale[5] = rcp(max(reduce_max(sqrt(scale[3])), make_float4(0.01f))); +} + + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/filter/filter_kernel.h b/intern/cycles/kernel/filter/filter_kernel.h new file mode 100644 index 00000000000..2ef03dc0a02 --- /dev/null +++ b/intern/cycles/kernel/filter/filter_kernel.h @@ -0,0 +1,50 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "util/util_color.h" +#include "util/util_math.h" +#include "util/util_math_fast.h" +#include "util/util_texture.h" + +#include "util/util_atomic.h" +#include "util/util_math_matrix.h" + +#include "kernel/filter/filter_defines.h" + +#include "kernel/filter/filter_features.h" +#ifdef __KERNEL_SSE3__ +# include "kernel/filter/filter_features_sse.h" +#endif + +#include "kernel/filter/filter_prefilter.h" + +#ifdef __KERNEL_GPU__ +# include "kernel/filter/filter_transform_gpu.h" +#else +# ifdef __KERNEL_SSE3__ +# include "kernel/filter/filter_transform_sse.h" +# else +# include "kernel/filter/filter_transform.h" +# endif +#endif + +#include "kernel/filter/filter_reconstruction.h" + +#ifdef __KERNEL_CPU__ +# include "kernel/filter/filter_nlm_cpu.h" +#else +# include "kernel/filter/filter_nlm_gpu.h" +#endif diff --git a/intern/cycles/kernel/filter/filter_nlm_cpu.h b/intern/cycles/kernel/filter/filter_nlm_cpu.h new file mode 100644 index 00000000000..5e989331bc2 --- /dev/null +++ b/intern/cycles/kernel/filter/filter_nlm_cpu.h @@ -0,0 +1,180 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +CCL_NAMESPACE_BEGIN + +ccl_device_inline void kernel_filter_nlm_calc_difference(int dx, int dy, + const float *ccl_restrict weight_image, + const float *ccl_restrict variance_image, + float *difference_image, + int4 rect, + int w, + int channel_offset, + float a, + float k_2) +{ + for(int y = rect.y; y < rect.w; y++) { + for(int x = rect.x; x < rect.z; x++) { + float diff = 0.0f; + int numChannels = channel_offset? 3 : 1; + for(int c = 0; c < numChannels; c++) { + float cdiff = weight_image[c*channel_offset + y*w+x] - weight_image[c*channel_offset + (y+dy)*w+(x+dx)]; + float pvar = variance_image[c*channel_offset + y*w+x]; + float qvar = variance_image[c*channel_offset + (y+dy)*w+(x+dx)]; + diff += (cdiff*cdiff - a*(pvar + min(pvar, qvar))) / (1e-8f + k_2*(pvar+qvar)); + } + if(numChannels > 1) { + diff *= 1.0f/numChannels; + } + difference_image[y*w+x] = diff; + } + } +} + +ccl_device_inline void kernel_filter_nlm_blur(const float *ccl_restrict difference_image, + float *out_image, + int4 rect, + int w, + int f) +{ + int aligned_lowx = rect.x / 4; + int aligned_highx = (rect.z + 3) / 4; + for(int y = rect.y; y < rect.w; y++) { + const int low = max(rect.y, y-f); + const int high = min(rect.w, y+f+1); + for(int x = rect.x; x < rect.z; x++) { + out_image[y*w+x] = 0.0f; + } + for(int y1 = low; y1 < high; y1++) { + float4* out_image4 = (float4*)(out_image + y*w); + float4* difference_image4 = (float4*)(difference_image + y1*w); + for(int x = aligned_lowx; x < aligned_highx; x++) { + out_image4[x] += difference_image4[x]; + } + } + for(int x = rect.x; x < rect.z; x++) { + out_image[y*w+x] *= 1.0f/(high - low); + } + } +} + +ccl_device_inline void kernel_filter_nlm_calc_weight(const float *ccl_restrict difference_image, + float *out_image, + int4 rect, + int w, + int f) +{ + for(int y = rect.y; y < rect.w; y++) { + for(int x = rect.x; x < rect.z; x++) { + out_image[y*w+x] = 0.0f; + } + } + for(int dx = -f; dx <= f; dx++) { + int pos_dx = max(0, dx); + int neg_dx 
= min(0, dx); + for(int y = rect.y; y < rect.w; y++) { + for(int x = rect.x-neg_dx; x < rect.z-pos_dx; x++) { + out_image[y*w+x] += difference_image[y*w+dx+x]; + } + } + } + for(int y = rect.y; y < rect.w; y++) { + for(int x = rect.x; x < rect.z; x++) { + const int low = max(rect.x, x-f); + const int high = min(rect.z, x+f+1); + out_image[y*w+x] = fast_expf(-max(out_image[y*w+x] * (1.0f/(high - low)), 0.0f)); + } + } +} + +ccl_device_inline void kernel_filter_nlm_update_output(int dx, int dy, + const float *ccl_restrict difference_image, + const float *ccl_restrict image, + float *out_image, + float *accum_image, + int4 rect, + int w, + int f) +{ + for(int y = rect.y; y < rect.w; y++) { + for(int x = rect.x; x < rect.z; x++) { + const int low = max(rect.x, x-f); + const int high = min(rect.z, x+f+1); + float sum = 0.0f; + for(int x1 = low; x1 < high; x1++) { + sum += difference_image[y*w+x1]; + } + float weight = sum * (1.0f/(high - low)); + accum_image[y*w+x] += weight; + out_image[y*w+x] += weight*image[(y+dy)*w+(x+dx)]; + } + } +} + +ccl_device_inline void kernel_filter_nlm_construct_gramian(int dx, int dy, + const float *ccl_restrict difference_image, + const float *ccl_restrict buffer, + float *transform, + int *rank, + float *XtWX, + float3 *XtWY, + int4 rect, + int4 filter_rect, + int w, int h, int f, + int pass_stride) +{ + /* fy and fy are in filter-window-relative coordinates, while x and y are in feature-window-relative coordinates. 
*/ + for(int fy = max(0, rect.y-filter_rect.y); fy < min(filter_rect.w, rect.w-filter_rect.y); fy++) { + int y = fy + filter_rect.y; + for(int fx = max(0, rect.x-filter_rect.x); fx < min(filter_rect.z, rect.z-filter_rect.x); fx++) { + int x = fx + filter_rect.x; + const int low = max(rect.x, x-f); + const int high = min(rect.z, x+f+1); + float sum = 0.0f; + for(int x1 = low; x1 < high; x1++) { + sum += difference_image[y*w+x1]; + } + float weight = sum * (1.0f/(high - low)); + + int storage_ofs = fy*filter_rect.z + fx; + float *l_transform = transform + storage_ofs*TRANSFORM_SIZE; + float *l_XtWX = XtWX + storage_ofs*XTWX_SIZE; + float3 *l_XtWY = XtWY + storage_ofs*XTWY_SIZE; + int *l_rank = rank + storage_ofs; + + kernel_filter_construct_gramian(x, y, 1, + dx, dy, w, h, + pass_stride, + buffer, + l_transform, l_rank, + weight, l_XtWX, l_XtWY, 0); + } + } +} + +ccl_device_inline void kernel_filter_nlm_normalize(float *out_image, + const float *ccl_restrict accum_image, + int4 rect, + int w) +{ + for(int y = rect.y; y < rect.w; y++) { + for(int x = rect.x; x < rect.z; x++) { + out_image[y*w+x] /= accum_image[y*w+x]; + } + } +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/filter/filter_nlm_gpu.h b/intern/cycles/kernel/filter/filter_nlm_gpu.h new file mode 100644 index 00000000000..2c5ac807051 --- /dev/null +++ b/intern/cycles/kernel/filter/filter_nlm_gpu.h @@ -0,0 +1,144 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +CCL_NAMESPACE_BEGIN + +ccl_device_inline void kernel_filter_nlm_calc_difference(int x, int y, + int dx, int dy, + const ccl_global float *ccl_restrict weight_image, + const ccl_global float *ccl_restrict variance_image, + ccl_global float *difference_image, + int4 rect, int w, + int channel_offset, + float a, float k_2) +{ + float diff = 0.0f; + int numChannels = channel_offset? 3 : 1; + for(int c = 0; c < numChannels; c++) { + float cdiff = weight_image[c*channel_offset + y*w+x] - weight_image[c*channel_offset + (y+dy)*w+(x+dx)]; + float pvar = variance_image[c*channel_offset + y*w+x]; + float qvar = variance_image[c*channel_offset + (y+dy)*w+(x+dx)]; + diff += (cdiff*cdiff - a*(pvar + min(pvar, qvar))) / (1e-8f + k_2*(pvar+qvar)); + } + if(numChannels > 1) { + diff *= 1.0f/numChannels; + } + difference_image[y*w+x] = diff; +} + +ccl_device_inline void kernel_filter_nlm_blur(int x, int y, + const ccl_global float *ccl_restrict difference_image, + ccl_global float *out_image, + int4 rect, int w, int f) +{ + float sum = 0.0f; + const int low = max(rect.y, y-f); + const int high = min(rect.w, y+f+1); + for(int y1 = low; y1 < high; y1++) { + sum += difference_image[y1*w+x]; + } + sum *= 1.0f/(high-low); + out_image[y*w+x] = sum; +} + +ccl_device_inline void kernel_filter_nlm_calc_weight(int x, int y, + const ccl_global float *ccl_restrict difference_image, + ccl_global float *out_image, + int4 rect, int w, int f) +{ + float sum = 0.0f; + const int low = max(rect.x, x-f); + const int high = min(rect.z, x+f+1); + for(int x1 = low; x1 < high; x1++) { + sum += difference_image[y*w+x1]; + } + sum *= 1.0f/(high-low); + out_image[y*w+x] = fast_expf(-max(sum, 0.0f)); +} + +ccl_device_inline void kernel_filter_nlm_update_output(int x, int y, + int dx, int dy, + const ccl_global float *ccl_restrict difference_image, + const ccl_global float *ccl_restrict image, + 
ccl_global float *out_image, + ccl_global float *accum_image, + int4 rect, int w, int f) +{ + float sum = 0.0f; + const int low = max(rect.x, x-f); + const int high = min(rect.z, x+f+1); + for(int x1 = low; x1 < high; x1++) { + sum += difference_image[y*w+x1]; + } + sum *= 1.0f/(high-low); + if(out_image) { + accum_image[y*w+x] += sum; + out_image[y*w+x] += sum*image[(y+dy)*w+(x+dx)]; + } + else { + accum_image[y*w+x] = sum; + } +} + +ccl_device_inline void kernel_filter_nlm_construct_gramian(int fx, int fy, + int dx, int dy, + const ccl_global float *ccl_restrict difference_image, + const ccl_global float *ccl_restrict buffer, + const ccl_global float *ccl_restrict transform, + ccl_global int *rank, + ccl_global float *XtWX, + ccl_global float3 *XtWY, + int4 rect, + int4 filter_rect, + int w, int h, int f, + int pass_stride, + int localIdx) +{ + int y = fy + filter_rect.y; + int x = fx + filter_rect.x; + const int low = max(rect.x, x-f); + const int high = min(rect.z, x+f+1); + float sum = 0.0f; + for(int x1 = low; x1 < high; x1++) { + sum += difference_image[y*w+x1]; + } + float weight = sum * (1.0f/(high - low)); + + int storage_ofs = fy*filter_rect.z + fx; + transform += storage_ofs; + rank += storage_ofs; + XtWX += storage_ofs; + XtWY += storage_ofs; + + kernel_filter_construct_gramian(x, y, + filter_rect.z*filter_rect.w, + dx, dy, w, h, + pass_stride, + buffer, + transform, rank, + weight, XtWX, XtWY, + localIdx); +} + +ccl_device_inline void kernel_filter_nlm_normalize(int x, int y, + ccl_global float *out_image, + const ccl_global float *ccl_restrict accum_image, + int4 rect, int w) +{ + out_image[y*w+x] /= accum_image[y*w+x]; +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/filter/filter_prefilter.h b/intern/cycles/kernel/filter/filter_prefilter.h new file mode 100644 index 00000000000..eefcbfea230 --- /dev/null +++ b/intern/cycles/kernel/filter/filter_prefilter.h @@ -0,0 +1,215 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed 
under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */

CCL_NAMESPACE_BEGIN

/* First step of the shadow prefiltering: performs the shadow division and stores
 * the results in plain rectangular arrays that can be handed to the NLM filter.
 *
 * Outputs, all indexed relative to rect with a row width rounded up to a multiple of 4:
 *   unfilteredA/B:   the two half images of the shadow feature pass.
 *   sampleVariance:  the sample-based variance. Biased in general, and especially
 *                    here since the variance of a ratio can only be approximated.
 *   sampleVarianceV: variance of the sample variance estimate; quite noisy, since it
 *                    is essentially the buffer variance of the two variance halves.
 *   bufferVariance:  the buffer-based variance of the shadow feature. Unbiased, but
 *                    quite noisy.
 */
ccl_device void kernel_filter_divide_shadow(int sample,
                                            ccl_global TilesInfo *tiles,
                                            int x, int y,
                                            ccl_global float *unfilteredA,
                                            ccl_global float *unfilteredB,
                                            ccl_global float *sampleVariance,
                                            ccl_global float *sampleVarianceV,
                                            ccl_global float *bufferVariance,
                                            int4 rect,
                                            int buffer_pass_stride,
                                            int buffer_denoising_offset)
{
	/* Pick the tile of the 3x3 neighbourhood that contains the pixel. */
	const int tile_col = (x < tiles->x[1])? 0: ((x < tiles->x[2])? 1: 2);
	const int tile_row = (y < tiles->y[1])? 0: ((y < tiles->y[2])? 1: 2);
	const int tile = tile_row*3+tile_col;

	const int tile_offset = tiles->offsets[tile];
	const int tile_stride = tiles->strides[tile];

	/* Advance to the pixel, then to entry 14 of its denoising data. */
	const ccl_global float *ccl_restrict center_buffer = (ccl_global float*) tiles->buffers[tile];
	center_buffer += (y*tile_stride + x + tile_offset)*buffer_pass_stride;
	center_buffer += buffer_denoising_offset + 14;

	const int buffer_w = align_up(rect.z - rect.x, 4);
	const int idx = (y-rect.y)*buffer_w + (x - rect.x);

	unfilteredA[idx] = center_buffer[1] / max(center_buffer[0], 1e-7f);
	unfilteredB[idx] = center_buffer[4] / max(center_buffer[3], 1e-7f);

	/* Number of samples that went into half A and half B respectively. */
	const int odd_sample = (sample+1)/2;
	const int even_sample = sample/2;

	/* Approximate variance as E[x^2] - 1/N * (E[x])^2, since online variance
	 * update does not work efficiently with atomics in the kernel. */
	float varA = max(0.0f, center_buffer[2] - unfilteredA[idx]*unfilteredA[idx]*odd_sample);
	float varB = max(0.0f, center_buffer[5] - unfilteredB[idx]*unfilteredB[idx]*even_sample);
	varA /= max(odd_sample - 1, 1);
	varB /= max(even_sample - 1, 1);

	sampleVariance[idx] = 0.5f*(varA + varB) / sample;
	sampleVarianceV[idx] = 0.5f * (varA - varB) * (varA - varB) / (sample*sample);
	bufferVariance[idx] = 0.5f * (unfilteredA[idx] - unfilteredB[idx]) * (unfilteredA[idx] - unfilteredB[idx]);
}

+/* Load a regular feature from the render buffers into the denoise buffer. + * Parameters: + * - sample: The sample amount in the buffer, used to normalize the buffer. + * - m_offset, v_offset: Render Buffer Pass offsets of mean and variance of the feature. + * - x, y: Current pixel + * - mean, variance: Target denoise buffers. + * - rect: The prefilter area (lower pixels inclusive, upper pixels exclusive).
+ */ +ccl_device void kernel_filter_get_feature(int sample, + ccl_global TilesInfo *tiles, + int m_offset, int v_offset, + int x, int y, + ccl_global float *mean, + ccl_global float *variance, + int4 rect, int buffer_pass_stride, + int buffer_denoising_offset) +{ + int xtile = (x < tiles->x[1])? 0: ((x < tiles->x[2])? 1: 2); + int ytile = (y < tiles->y[1])? 0: ((y < tiles->y[2])? 1: 2); + int tile = ytile*3+xtile; + ccl_global float *center_buffer = ((ccl_global float*) tiles->buffers[tile]) + (tiles->offsets[tile] + y*tiles->strides[tile] + x)*buffer_pass_stride + buffer_denoising_offset; + + int buffer_w = align_up(rect.z - rect.x, 4); + int idx = (y-rect.y)*buffer_w + (x - rect.x); + + mean[idx] = center_buffer[m_offset] / sample; + if(sample > 1) { + /* Approximate variance as E[x^2] - 1/N * (E[x])^2, since online variance + * update does not work efficiently with atomics in the kernel. */ + variance[idx] = max(0.0f, (center_buffer[v_offset] - mean[idx]*mean[idx]*sample) / (sample * (sample-1))); + } + else { + /* Can't compute variance with single sample, just set it very high. 
*/ + variance[idx] = 1e10f; + } +} + +ccl_device void kernel_filter_detect_outliers(int x, int y, + ccl_global float *image, + ccl_global float *variance, + ccl_global float *depth, + ccl_global float *out, + int4 rect, + int pass_stride) +{ + int buffer_w = align_up(rect.z - rect.x, 4); + + int idx = (y-rect.y)*buffer_w + (x-rect.x); + float3 color = make_float3(image[idx], image[idx+pass_stride], image[idx+2*pass_stride]); + + float fac = 1.0f; + if(color.x < 0.0f || color.y < 0.0f || color.z < 0.0f) { + depth[idx] = -depth[idx]; + fac = 0.0f; + } + else { + float L = average(color); + int n = 0; + float values[25]; + for(int y1 = max(y-2, rect.y); y1 < min(y+3, rect.w); y1++) { + for(int x1 = max(x-2, rect.x); x1 < min(x+3, rect.z); x1++) { + int idx = (y1-rect.y)*buffer_w + (x1-rect.x); + float L = average(make_float3(image[idx], image[idx+pass_stride], image[idx+2*pass_stride])); + + /* Find the position of L. */ + int i; + for(i = 0; i < n; i++) { + if(values[i] > L) break; + } + /* Make space for L by shifting all following values to the right. */ + for(int j = n; j > i; j--) { + values[j] = values[j-1]; + } + /* Insert L. */ + values[i] = L; + n++; + } + } + + float ref = 2.0f*values[(int)(n*0.75f)]; + if(L > ref) { + /* The pixel appears to be an outlier. + * However, it may just be a legitimate highlight. Therefore, it is checked how likely it is that the pixel + * should actually be at the reference value: + * If the reference is within the 3-sigma interval, the pixel is assumed to be a statistical outlier. + * Otherwise, it is very unlikely that the pixel should be darker, which indicates a legitimate highlight. + */ + float stddev = sqrtf(average(make_float3(variance[idx], variance[idx+pass_stride], variance[idx+2*pass_stride]))); + if(L - 3*stddev < ref) { + /* The pixel is an outlier, so negate the depth value to mark it as one. + * Also, scale its brightness down to the outlier threshold to avoid trouble with the NLM weights. 
*/ + depth[idx] = -depth[idx]; + fac = ref/L; + variance[idx ] *= fac*fac; + variance[idx + pass_stride] *= fac*fac; + variance[idx+2*pass_stride] *= fac*fac; + } + } + } + out[idx ] = fac*image[idx]; + out[idx + pass_stride] = fac*image[idx + pass_stride]; + out[idx+2*pass_stride] = fac*image[idx+2*pass_stride]; +} + +/* Combine A/B buffers. + * Calculates the combined mean and the buffer variance. */ +ccl_device void kernel_filter_combine_halves(int x, int y, + ccl_global float *mean, + ccl_global float *variance, + ccl_global float *a, + ccl_global float *b, + int4 rect, int r) +{ + int buffer_w = align_up(rect.z - rect.x, 4); + int idx = (y-rect.y)*buffer_w + (x - rect.x); + + if(mean) mean[idx] = 0.5f * (a[idx]+b[idx]); + if(variance) { + if(r == 0) variance[idx] = 0.25f * (a[idx]-b[idx])*(a[idx]-b[idx]); + else { + variance[idx] = 0.0f; + float values[25]; + int numValues = 0; + for(int py = max(y-r, rect.y); py < min(y+r+1, rect.w); py++) { + for(int px = max(x-r, rect.x); px < min(x+r+1, rect.z); px++) { + int pidx = (py-rect.y)*buffer_w + (px-rect.x); + values[numValues++] = 0.25f * (a[pidx]-b[pidx])*(a[pidx]-b[pidx]); + } + } + /* Insertion-sort the variances (fast enough for 25 elements). */ + for(int i = 1; i < numValues; i++) { + float v = values[i]; + int j; + for(j = i-1; j >= 0 && values[j] > v; j--) + values[j+1] = values[j]; + values[j+1] = v; + } + variance[idx] = values[(7*numValues)/8]; + } + } +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/filter/filter_reconstruction.h b/intern/cycles/kernel/filter/filter_reconstruction.h new file mode 100644 index 00000000000..25a3025056c --- /dev/null +++ b/intern/cycles/kernel/filter/filter_reconstruction.h @@ -0,0 +1,117 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +CCL_NAMESPACE_BEGIN + +ccl_device_inline void kernel_filter_construct_gramian(int x, int y, + int storage_stride, + int dx, int dy, + int w, int h, + int pass_stride, + const ccl_global float *ccl_restrict buffer, + const ccl_global float *ccl_restrict transform, + ccl_global int *rank, + float weight, + ccl_global float *XtWX, + ccl_global float3 *XtWY, + int localIdx) +{ + if(weight < 1e-3f) { + return; + } + + int p_offset = y *w + x; + int q_offset = (y+dy)*w + (x+dx); + +#ifdef __KERNEL_GPU__ + const int stride = storage_stride; +#else + const int stride = 1; + (void) storage_stride; +#endif + +#ifdef __KERNEL_CUDA__ + ccl_local float shared_design_row[(DENOISE_FEATURES+1)*CCL_MAX_LOCAL_SIZE]; + ccl_local_param float *design_row = shared_design_row + localIdx*(DENOISE_FEATURES+1); +#else + float design_row[DENOISE_FEATURES+1]; +#endif + + float3 q_color = filter_get_color(buffer + q_offset, pass_stride); + + /* If the pixel was flagged as an outlier during prefiltering, skip it. 
*/ + if(ccl_get_feature(buffer + q_offset, 0) < 0.0f) { + return; + } + + filter_get_design_row_transform(make_int2(x, y), buffer + p_offset, + make_int2(x+dx, y+dy), buffer + q_offset, + pass_stride, *rank, design_row, transform, stride); + + math_trimatrix_add_gramian_strided(XtWX, (*rank)+1, design_row, weight, stride); + math_vec3_add_strided(XtWY, (*rank)+1, design_row, weight * q_color, stride); +} + +ccl_device_inline void kernel_filter_finalize(int x, int y, int w, int h, + ccl_global float *buffer, + ccl_global int *rank, + int storage_stride, + ccl_global float *XtWX, + ccl_global float3 *XtWY, + int4 buffer_params, + int sample) +{ +#ifdef __KERNEL_GPU__ + const int stride = storage_stride; +#else + const int stride = 1; + (void) storage_stride; +#endif + + if(XtWX[0] < 1e-3f) { + /* There is not enough information to determine a denoised result. + * As a fallback, keep the original value of the pixel. */ + return; + } + + /* The weighted average of pixel colors (essentially, the NLM-filtered image). + * In case the solution of the linear model fails due to numerical issues, + * fall back to this value. */ + float3 mean_color = XtWY[0]/XtWX[0]; + + math_trimatrix_vec3_solve(XtWX, XtWY, (*rank)+1, stride); + + float3 final_color = XtWY[0]; + if(!isfinite3_safe(final_color)) { + final_color = mean_color; + } + + /* Clamp pixel value to positive values. 
*/ + final_color = max(final_color, make_float3(0.0f, 0.0f, 0.0f)); + + ccl_global float *combined_buffer = buffer + (y*buffer_params.y + x + buffer_params.x)*buffer_params.z; + final_color *= sample; + if(buffer_params.w) { + final_color.x += combined_buffer[buffer_params.w+0]; + final_color.y += combined_buffer[buffer_params.w+1]; + final_color.z += combined_buffer[buffer_params.w+2]; + } + combined_buffer[0] = final_color.x; + combined_buffer[1] = final_color.y; + combined_buffer[2] = final_color.z; +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/filter/filter_transform.h b/intern/cycles/kernel/filter/filter_transform.h new file mode 100644 index 00000000000..a5f87c05ec0 --- /dev/null +++ b/intern/cycles/kernel/filter/filter_transform.h @@ -0,0 +1,108 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +CCL_NAMESPACE_BEGIN + +ccl_device void kernel_filter_construct_transform(const float *ccl_restrict buffer, + int x, int y, int4 rect, + int pass_stride, + float *transform, int *rank, + int radius, float pca_threshold) +{ + int buffer_w = align_up(rect.z - rect.x, 4); + + float features[DENOISE_FEATURES]; + + /* Temporary storage, used in different steps of the algorithm. */ + float tempmatrix[DENOISE_FEATURES*DENOISE_FEATURES]; + float tempvector[2*DENOISE_FEATURES]; + const float *ccl_restrict pixel_buffer; + int2 pixel; + + /* === Calculate denoising window. 
=== */ /* low is inclusive and high is exclusive, so num_pixels is the pixel count of the clipped window. */ + int2 low = make_int2(max(rect.x, x - radius), + max(rect.y, y - radius)); + int2 high = make_int2(min(rect.z, x + radius + 1), + min(rect.w, y + radius + 1)); + int num_pixels = (high.y - low.y) * (high.x - low.x); + + /* === Shift feature passes to have mean 0. === */ + float feature_means[DENOISE_FEATURES]; + math_vector_zero(feature_means, DENOISE_FEATURES); + FOR_PIXEL_WINDOW { + filter_get_features(pixel, pixel_buffer, features, NULL, pass_stride); + math_vector_add(feature_means, features, DENOISE_FEATURES); + } END_FOR_PIXEL_WINDOW + + /* Turn the accumulated sums into per-feature means. */ math_vector_scale(feature_means, 1.0f / num_pixels, DENOISE_FEATURES); + + /* === Scale the shifted feature passes to a range of [-1; 1], will be baked into the transform later. === */ + /* feature_scale aliases the start of tempvector. */ float *feature_scale = tempvector; + math_vector_zero(feature_scale, DENOISE_FEATURES); + + FOR_PIXEL_WINDOW { + filter_get_feature_scales(pixel, pixel_buffer, features, feature_means, pass_stride); + math_vector_max(feature_scale, features, DENOISE_FEATURES); + } END_FOR_PIXEL_WINDOW + + /* NOTE(review): presumably converts the collected maxima into multiplicative scale factors - confirm in filter_calculate_scale. */ filter_calculate_scale(feature_scale); + + /* === Generate the feature transformation. === + * This transformation maps the DENOISE_FEATURES-dimensional feature space to a reduced feature (r-feature) space + * which generally has fewer dimensions. This mainly helps to prevent overfitting. */ + /* feature_matrix aliases tempmatrix and accumulates the Gramian of the shifted, scaled features. */ float* feature_matrix = tempmatrix; + math_matrix_zero(feature_matrix, DENOISE_FEATURES); + FOR_PIXEL_WINDOW { + filter_get_features(pixel, pixel_buffer, features, feature_means, pass_stride); + math_vector_mul(features, feature_scale, DENOISE_FEATURES); + math_matrix_add_gramian(feature_matrix, DENOISE_FEATURES, features, 1.0f); + } END_FOR_PIXEL_WINDOW + + /* The diagonal of feature_matrix is read below as the per-component energy. NOTE(review): presumably the decomposition leaves eigenvalues there in descending order - confirm. */ math_matrix_jacobi_eigendecomposition(feature_matrix, transform, DENOISE_FEATURES, 1); + *rank = 0; + /* Prevent overfitting when a small window is used.
*/ + int max_rank = min(DENOISE_FEATURES, num_pixels/3); + if(pca_threshold < 0.0f) { /* Negative threshold: keep components until their summed diagonal entries reach (1 - |pca_threshold|) times the total; the check only applies from i = 2 on. */ + float threshold_energy = 0.0f; + for(int i = 0; i < DENOISE_FEATURES; i++) { + threshold_energy += feature_matrix[i*DENOISE_FEATURES+i]; + } + threshold_energy *= 1.0f - (-pca_threshold); + + float reduced_energy = 0.0f; + for(int i = 0; i < max_rank; i++, (*rank)++) { + if(i >= 2 && reduced_energy >= threshold_energy) + break; + float s = feature_matrix[i*DENOISE_FEATURES+i]; + reduced_energy += s; + } + } + else { /* Non-negative threshold: stop at the first component with i >= 2 whose sqrtf of the diagonal entry is below pca_threshold. */ + for(int i = 0; i < max_rank; i++, (*rank)++) { + float s = feature_matrix[i*DENOISE_FEATURES+i]; + if(i >= 2 && sqrtf(s) < pca_threshold) + break; + } + } + + /* Bake the feature scaling into the transformation matrix. */ + /* Only the first *rank rows are scaled, and this happens before the transpose. */ for(int i = 0; i < (*rank); i++) { + math_vector_mul(transform + i*DENOISE_FEATURES, feature_scale, DENOISE_FEATURES); + } + math_matrix_transpose(transform, DENOISE_FEATURES, 1); +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/filter/filter_transform_gpu.h b/intern/cycles/kernel/filter/filter_transform_gpu.h new file mode 100644 index 00000000000..83a1222bbdb --- /dev/null +++ b/intern/cycles/kernel/filter/filter_transform_gpu.h @@ -0,0 +1,119 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ + +CCL_NAMESPACE_BEGIN + +/* GPU variant of the feature transform construction for pixel (x, y). transform and rank are ccl_global pointers, and elements of transform are addressed with a stride of transform_stride. On __KERNEL_CUDA__ builds the feature vector is a slice of a ccl_local array selected by localIdx; localIdx must be below CCL_MAX_LOCAL_SIZE for that slice to stay inside the array. NOTE(review): localIdx is presumably the thread's index within its block - confirm at the call site. */ ccl_device void kernel_filter_construct_transform(const ccl_global float *ccl_restrict buffer, + int x, int y, int4 rect, + int pass_stride, + ccl_global float *transform, + ccl_global int *rank, + int radius, float pca_threshold, + int transform_stride, int localIdx) +{ + /* NOTE(review): buffer_w, low, high, pixel and pixel_buffer are not referenced directly in this function; presumably the FOR_PIXEL_WINDOW macro uses them - confirm against its definition. */ int buffer_w = align_up(rect.z - rect.x, 4); + +#ifdef __KERNEL_CUDA__ + /* Each localIdx gets its own DENOISE_FEATURES-sized slice of the ccl_local array. */ ccl_local float shared_features[DENOISE_FEATURES*CCL_MAX_LOCAL_SIZE]; + ccl_local_param float *features = shared_features + localIdx*DENOISE_FEATURES; +#else + float features[DENOISE_FEATURES]; +#endif + + /* === Calculate denoising window. === */ /* low is inclusive and high is exclusive, so num_pixels is the pixel count of the clipped window. */ + int2 low = make_int2(max(rect.x, x - radius), + max(rect.y, y - radius)); + int2 high = make_int2(min(rect.z, x + radius + 1), + min(rect.w, y + radius + 1)); + int num_pixels = (high.y - low.y) * (high.x - low.x); + const ccl_global float *ccl_restrict pixel_buffer; + int2 pixel; + + + + + /* === Shift feature passes to have mean 0. === */ + float feature_means[DENOISE_FEATURES]; + math_vector_zero(feature_means, DENOISE_FEATURES); + FOR_PIXEL_WINDOW { + filter_get_features(pixel, pixel_buffer, features, NULL, pass_stride); + math_vector_add(feature_means, features, DENOISE_FEATURES); + } END_FOR_PIXEL_WINDOW + + /* Turn the accumulated sums into per-feature means. */ math_vector_scale(feature_means, 1.0f / num_pixels, DENOISE_FEATURES); + + /* === Scale the shifted feature passes to a range of [-1; 1], will be baked into the transform later. === */ + float feature_scale[DENOISE_FEATURES]; + math_vector_zero(feature_scale, DENOISE_FEATURES); + + FOR_PIXEL_WINDOW { + filter_get_feature_scales(pixel, pixel_buffer, features, feature_means, pass_stride); + math_vector_max(feature_scale, features, DENOISE_FEATURES); + } END_FOR_PIXEL_WINDOW + + /* NOTE(review): presumably converts the collected maxima into multiplicative scale factors - confirm in filter_calculate_scale. */ filter_calculate_scale(feature_scale); + + + + /* === Generate the feature transformation. === + * This transformation maps the DENOISE_FEATURES-dimensional feature space to a reduced feature (r-feature) space + * which generally has fewer dimensions.
This mainly helps to prevent overfitting. */ + /* Accumulates the Gramian of the shifted, scaled features. */ float feature_matrix[DENOISE_FEATURES*DENOISE_FEATURES]; + math_matrix_zero(feature_matrix, DENOISE_FEATURES); + FOR_PIXEL_WINDOW { + filter_get_features(pixel, pixel_buffer, features, feature_means, pass_stride); + math_vector_mul(features, feature_scale, DENOISE_FEATURES); + math_matrix_add_gramian(feature_matrix, DENOISE_FEATURES, features, 1.0f); + } END_FOR_PIXEL_WINDOW + + /* The diagonal of feature_matrix is read below as the per-component energy. NOTE(review): presumably the decomposition leaves eigenvalues there in descending order - confirm. */ math_matrix_jacobi_eigendecomposition(feature_matrix, transform, DENOISE_FEATURES, transform_stride); + *rank = 0; + /* Prevent overfitting when a small window is used. */ + int max_rank = min(DENOISE_FEATURES, num_pixels/3); + if(pca_threshold < 0.0f) { /* Negative threshold: keep components until their summed diagonal entries reach (1 - |pca_threshold|) times the total; the check only applies from i = 2 on. */ + float threshold_energy = 0.0f; + for(int i = 0; i < DENOISE_FEATURES; i++) { + threshold_energy += feature_matrix[i*DENOISE_FEATURES+i]; + } + threshold_energy *= 1.0f - (-pca_threshold); + + float reduced_energy = 0.0f; + for(int i = 0; i < max_rank; i++, (*rank)++) { + if(i >= 2 && reduced_energy >= threshold_energy) + break; + float s = feature_matrix[i*DENOISE_FEATURES+i]; + reduced_energy += s; + } + } + else { /* Non-negative threshold: stop at the first component with i >= 2 whose sqrtf of the diagonal entry is below pca_threshold. */ + for(int i = 0; i < max_rank; i++, (*rank)++) { + float s = feature_matrix[i*DENOISE_FEATURES+i]; + if(i >= 2 && sqrtf(s) < pca_threshold) + break; + } + } + + math_matrix_transpose(transform, DENOISE_FEATURES, transform_stride); + + /* Bake the feature scaling into the transformation matrix.
*/ /* Applied after the transpose: element (i, j) is multiplied by feature_scale[i] for the first *rank values of j. */ + for(int i = 0; i < DENOISE_FEATURES; i++) { + for(int j = 0; j < (*rank); j++) { + transform[(i*DENOISE_FEATURES + j)*transform_stride] *= feature_scale[i]; + } + } +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/filter/filter_transform_sse.h b/intern/cycles/kernel/filter/filter_transform_sse.h new file mode 100644 index 00000000000..9e65f61664b --- /dev/null +++ b/intern/cycles/kernel/filter/filter_transform_sse.h @@ -0,0 +1,105 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ + +CCL_NAMESPACE_BEGIN + +/* SSE variant of the feature transform construction for pixel (x, y). Feature statistics are accumulated in float4 values inside FOR_PIXEL_WINDOW_SSE loops and reduced to scalars before math_matrix_jacobi_eigendecomposition; *rank is then selected from pca_threshold and the transposed, scaled matrix is left in transform (stride 1). */ ccl_device void kernel_filter_construct_transform(const float *ccl_restrict buffer, + int x, int y, int4 rect, + int pass_stride, + float *transform, int *rank, + int radius, float pca_threshold) +{ + /* NOTE(review): buffer_w, low, high, pixel and pixel_buffer are not referenced directly in this function, and x4, y4 and active_pixels are not declared here; presumably the FOR_PIXEL_WINDOW_SSE macro uses and provides them - confirm against its definition. */ int buffer_w = align_up(rect.z - rect.x, 4); + + float4 features[DENOISE_FEATURES]; + const float *ccl_restrict pixel_buffer; + int2 pixel; + + /* === Calculate denoising window. === low is inclusive and high is exclusive, so num_pixels is the pixel count of the clipped window. */ int2 low = make_int2(max(rect.x, x - radius), + max(rect.y, y - radius)); + int2 high = make_int2(min(rect.z, x + radius + 1), + min(rect.w, y + radius + 1)); + int num_pixels = (high.y - low.y) * (high.x - low.x); + + /* === Shift feature passes to have mean 0. === */ float4 feature_means[DENOISE_FEATURES]; + math_vector_zero_sse(feature_means, DENOISE_FEATURES); + FOR_PIXEL_WINDOW_SSE { + filter_get_features_sse(x4, y4, active_pixels, pixel_buffer, features, NULL, pass_stride); + math_vector_add_sse(feature_means, DENOISE_FEATURES, features); + } END_FOR_PIXEL_WINDOW_SSE + + /* NOTE(review): reduce_add presumably sums the four lanes, which would make each entry the per-feature mean - confirm. */ float4 pixel_scale = make_float4(1.0f / num_pixels); + for(int i = 0; i < DENOISE_FEATURES; i++) { + feature_means[i] = reduce_add(feature_means[i]) * pixel_scale; + } + + /* === Collect the feature scales; they are baked into the transform at the end. === */ float4 feature_scale[DENOISE_FEATURES]; + math_vector_zero_sse(feature_scale, DENOISE_FEATURES); + FOR_PIXEL_WINDOW_SSE { + filter_get_feature_scales_sse(x4, y4, active_pixels, pixel_buffer, features, feature_means, pass_stride); + math_vector_max_sse(feature_scale, features, DENOISE_FEATURES); + } END_FOR_PIXEL_WINDOW_SSE + + filter_calculate_scale_sse(feature_scale); + + /* === Generate the feature transformation. === The Gramian of the shifted, scaled features is accumulated in float4 form first. */ float4 feature_matrix_sse[DENOISE_FEATURES*DENOISE_FEATURES]; + math_matrix_zero_sse(feature_matrix_sse, DENOISE_FEATURES); + FOR_PIXEL_WINDOW_SSE { + filter_get_features_sse(x4, y4, active_pixels, pixel_buffer, features, feature_means, pass_stride); + math_vector_mul_sse(features, DENOISE_FEATURES, feature_scale); + math_matrix_add_gramian_sse(feature_matrix_sse, DENOISE_FEATURES, features, make_float4(1.0f)); + } END_FOR_PIXEL_WINDOW_SSE + + /* NOTE(review): math_matrix_hsum presumably sums the four lanes of every entry into the scalar matrix - confirm. */ float feature_matrix[DENOISE_FEATURES*DENOISE_FEATURES]; + math_matrix_hsum(feature_matrix,
DENOISE_FEATURES, feature_matrix_sse); + + /* The diagonal of feature_matrix is read below as the per-component energy. NOTE(review): presumably the decomposition leaves eigenvalues there in descending order - confirm. */ math_matrix_jacobi_eigendecomposition(feature_matrix, transform, DENOISE_FEATURES, 1); + + *rank = 0; + /* Prevent overfitting when a small window is used. */ + int max_rank = min(DENOISE_FEATURES, num_pixels/3); + if(pca_threshold < 0.0f) { /* Negative threshold: keep components until their summed diagonal entries reach (1 - |pca_threshold|) times the total; the check only applies from i = 2 on. */ + float threshold_energy = 0.0f; + for(int i = 0; i < DENOISE_FEATURES; i++) { + threshold_energy += feature_matrix[i*DENOISE_FEATURES+i]; + } + threshold_energy *= 1.0f - (-pca_threshold); + + float reduced_energy = 0.0f; + for(int i = 0; i < max_rank; i++, (*rank)++) { + if(i >= 2 && reduced_energy >= threshold_energy) + break; + float s = feature_matrix[i*DENOISE_FEATURES+i]; + reduced_energy += s; + } + } + else { /* Non-negative threshold: stop at the first component with i >= 2 whose sqrtf of the diagonal entry is below pca_threshold. */ + for(int i = 0; i < max_rank; i++, (*rank)++) { + float s = feature_matrix[i*DENOISE_FEATURES+i]; + if(i >= 2 && sqrtf(s) < pca_threshold) + break; + } + } + + math_matrix_transpose(transform, DENOISE_FEATURES, 1); + + /* Bake the feature scaling into the transformation matrix. */ + /* Applied after the transpose: the first *rank entries of row i are multiplied by lane 0 of feature_scale[i]. NOTE(review): presumably all four lanes hold the same value after filter_calculate_scale_sse - confirm. */ for(int i = 0; i < DENOISE_FEATURES; i++) { + math_vector_scale(transform + i*DENOISE_FEATURES, feature_scale[i][0], *rank); + } +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/geom/geom.h b/intern/cycles/kernel/geom/geom.h index 24ced934c8b..f34b77ebc07 100644 --- a/intern/cycles/kernel/geom/geom.h +++ b/intern/cycles/kernel/geom/geom.h @@ -14,17 +14,20 @@ * limitations under the License.
*/ -#include "geom_attribute.h" -#include "geom_object.h" +#include "kernel/geom/geom_attribute.h" +#include "kernel/geom/geom_object.h" #ifdef __PATCH_EVAL__ -# include "geom_patch.h" +# include "kernel/geom/geom_patch.h" #endif -#include "geom_triangle.h" -#include "geom_subd_triangle.h" -#include "geom_triangle_intersect.h" -#include "geom_motion_triangle.h" -#include "geom_motion_curve.h" -#include "geom_curve.h" -#include "geom_volume.h" -#include "geom_primitive.h" +#include "kernel/geom/geom_triangle.h" +#include "kernel/geom/geom_subd_triangle.h" +#include "kernel/geom/geom_triangle_intersect.h" +#include "kernel/geom/geom_motion_triangle.h" +#include "kernel/geom/geom_motion_triangle_intersect.h" +#include "kernel/geom/geom_motion_triangle_shader.h" +#include "kernel/geom/geom_motion_curve.h" +#include "kernel/geom/geom_curve.h" +#include "kernel/geom/geom_curve_intersect.h" +#include "kernel/geom/geom_volume.h" +#include "kernel/geom/geom_primitive.h" diff --git a/intern/cycles/kernel/geom/geom_attribute.h b/intern/cycles/kernel/geom/geom_attribute.h index 08ccee56335..cc62192ef21 100644 --- a/intern/cycles/kernel/geom/geom_attribute.h +++ b/intern/cycles/kernel/geom/geom_attribute.h @@ -30,7 +30,7 @@ ccl_device_inline uint subd_triangle_patch(KernelGlobals *kg, const ShaderData * ccl_device_inline uint attribute_primitive_type(KernelGlobals *kg, const ShaderData *sd) { #ifdef __HAIR__ - if(ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE) { + if(sd->type & PRIMITIVE_ALL_CURVE) { return ATTR_PRIM_CURVE; } else @@ -53,12 +53,12 @@ ccl_device_inline AttributeDescriptor attribute_not_found() ccl_device_inline AttributeDescriptor find_attribute(KernelGlobals *kg, const ShaderData *sd, uint id) { - if(ccl_fetch(sd, object) == PRIM_NONE) { + if(sd->object == PRIM_NONE) { return attribute_not_found(); } /* for SVM, find attribute by unique id */ - uint attr_offset = ccl_fetch(sd, object)*kernel_data.bvh.attributes_map_stride; + uint attr_offset = 
sd->object*kernel_data.bvh.attributes_map_stride; attr_offset += attribute_primitive_type(kg, sd); uint4 attr_map = kernel_tex_fetch(__attributes_map, attr_offset); @@ -73,7 +73,7 @@ ccl_device_inline AttributeDescriptor find_attribute(KernelGlobals *kg, const Sh AttributeDescriptor desc; desc.element = (AttributeElement)attr_map.y; - if(ccl_fetch(sd, prim) == PRIM_NONE && + if(sd->prim == PRIM_NONE && desc.element != ATTR_ELEMENT_MESH && desc.element != ATTR_ELEMENT_VOXEL && desc.element != ATTR_ELEMENT_OBJECT) diff --git a/intern/cycles/kernel/geom/geom_curve.h b/intern/cycles/kernel/geom/geom_curve.h index 84aaaab7453..e35267f02bf 100644 --- a/intern/cycles/kernel/geom/geom_curve.h +++ b/intern/cycles/kernel/geom/geom_curve.h @@ -16,9 +16,10 @@ CCL_NAMESPACE_BEGIN /* Curve Primitive * - * Curve primitive for rendering hair and fur. These can be render as flat ribbons - * or curves with actual thickness. The curve can also be rendered as line segments - * rather than curves for better performance */ + * Curve primitive for rendering hair and fur. These can be render as flat + * ribbons or curves with actual thickness. The curve can also be rendered as + * line segments rather than curves for better performance. 
+ */ #ifdef __HAIR__ @@ -32,22 +33,22 @@ ccl_device float curve_attribute_float(KernelGlobals *kg, const ShaderData *sd, if(dy) *dy = 0.0f; #endif - return kernel_tex_fetch(__attributes_float, desc.offset + ccl_fetch(sd, prim)); + return kernel_tex_fetch(__attributes_float, desc.offset + sd->prim); } else if(desc.element == ATTR_ELEMENT_CURVE_KEY || desc.element == ATTR_ELEMENT_CURVE_KEY_MOTION) { - float4 curvedata = kernel_tex_fetch(__curves, ccl_fetch(sd, prim)); - int k0 = __float_as_int(curvedata.x) + PRIMITIVE_UNPACK_SEGMENT(ccl_fetch(sd, type)); + float4 curvedata = kernel_tex_fetch(__curves, sd->prim); + int k0 = __float_as_int(curvedata.x) + PRIMITIVE_UNPACK_SEGMENT(sd->type); int k1 = k0 + 1; float f0 = kernel_tex_fetch(__attributes_float, desc.offset + k0); float f1 = kernel_tex_fetch(__attributes_float, desc.offset + k1); #ifdef __RAY_DIFFERENTIALS__ - if(dx) *dx = ccl_fetch(sd, du).dx*(f1 - f0); + if(dx) *dx = sd->du.dx*(f1 - f0); if(dy) *dy = 0.0f; #endif - return (1.0f - ccl_fetch(sd, u))*f0 + ccl_fetch(sd, u)*f1; + return (1.0f - sd->u)*f0 + sd->u*f1; } else { #ifdef __RAY_DIFFERENTIALS__ @@ -71,22 +72,22 @@ ccl_device float3 curve_attribute_float3(KernelGlobals *kg, const ShaderData *sd if(dy) *dy = make_float3(0.0f, 0.0f, 0.0f); #endif - return float4_to_float3(kernel_tex_fetch(__attributes_float3, desc.offset + ccl_fetch(sd, prim))); + return float4_to_float3(kernel_tex_fetch(__attributes_float3, desc.offset + sd->prim)); } else if(desc.element == ATTR_ELEMENT_CURVE_KEY || desc.element == ATTR_ELEMENT_CURVE_KEY_MOTION) { - float4 curvedata = kernel_tex_fetch(__curves, ccl_fetch(sd, prim)); - int k0 = __float_as_int(curvedata.x) + PRIMITIVE_UNPACK_SEGMENT(ccl_fetch(sd, type)); + float4 curvedata = kernel_tex_fetch(__curves, sd->prim); + int k0 = __float_as_int(curvedata.x) + PRIMITIVE_UNPACK_SEGMENT(sd->type); int k1 = k0 + 1; float3 f0 = float4_to_float3(kernel_tex_fetch(__attributes_float3, desc.offset + k0)); float3 f1 = 
float4_to_float3(kernel_tex_fetch(__attributes_float3, desc.offset + k1)); #ifdef __RAY_DIFFERENTIALS__ - if(dx) *dx = ccl_fetch(sd, du).dx*(f1 - f0); + if(dx) *dx = sd->du.dx*(f1 - f0); if(dy) *dy = make_float3(0.0f, 0.0f, 0.0f); #endif - return (1.0f - ccl_fetch(sd, u))*f0 + ccl_fetch(sd, u)*f1; + return (1.0f - sd->u)*f0 + sd->u*f1; } else { #ifdef __RAY_DIFFERENTIALS__ @@ -104,22 +105,22 @@ ccl_device float curve_thickness(KernelGlobals *kg, ShaderData *sd) { float r = 0.0f; - if(ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE) { - float4 curvedata = kernel_tex_fetch(__curves, ccl_fetch(sd, prim)); - int k0 = __float_as_int(curvedata.x) + PRIMITIVE_UNPACK_SEGMENT(ccl_fetch(sd, type)); + if(sd->type & PRIMITIVE_ALL_CURVE) { + float4 curvedata = kernel_tex_fetch(__curves, sd->prim); + int k0 = __float_as_int(curvedata.x) + PRIMITIVE_UNPACK_SEGMENT(sd->type); int k1 = k0 + 1; float4 P_curve[2]; - if(ccl_fetch(sd, type) & PRIMITIVE_CURVE) { + if(sd->type & PRIMITIVE_CURVE) { P_curve[0]= kernel_tex_fetch(__curve_keys, k0); P_curve[1]= kernel_tex_fetch(__curve_keys, k1); } else { - motion_curve_keys(kg, ccl_fetch(sd, object), ccl_fetch(sd, prim), ccl_fetch(sd, time), k0, k1, P_curve); + motion_curve_keys(kg, sd->object, sd->prim, sd->time, k0, k1, P_curve); } - r = (P_curve[1].w - P_curve[0].w) * ccl_fetch(sd, u) + P_curve[0].w; + r = (P_curve[1].w - P_curve[0].w) * sd->u + P_curve[0].w; } return r*2.0f; @@ -130,8 +131,8 @@ ccl_device float curve_thickness(KernelGlobals *kg, ShaderData *sd) ccl_device float3 curve_motion_center_location(KernelGlobals *kg, ShaderData *sd) { - float4 curvedata = kernel_tex_fetch(__curves, ccl_fetch(sd, prim)); - int k0 = __float_as_int(curvedata.x) + PRIMITIVE_UNPACK_SEGMENT(ccl_fetch(sd, type)); + float4 curvedata = kernel_tex_fetch(__curves, sd->prim); + int k0 = __float_as_int(curvedata.x) + PRIMITIVE_UNPACK_SEGMENT(sd->type); int k1 = k0 + 1; float4 P_curve[2]; @@ -139,23 +140,23 @@ ccl_device float3 
curve_motion_center_location(KernelGlobals *kg, ShaderData *sd P_curve[0]= kernel_tex_fetch(__curve_keys, k0); P_curve[1]= kernel_tex_fetch(__curve_keys, k1); - return float4_to_float3(P_curve[1]) * ccl_fetch(sd, u) + float4_to_float3(P_curve[0]) * (1.0f - ccl_fetch(sd, u)); + return float4_to_float3(P_curve[1]) * sd->u + float4_to_float3(P_curve[0]) * (1.0f - sd->u); } /* Curve tangent normal */ ccl_device float3 curve_tangent_normal(KernelGlobals *kg, ShaderData *sd) -{ +{ float3 tgN = make_float3(0.0f,0.0f,0.0f); - if(ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE) { + if(sd->type & PRIMITIVE_ALL_CURVE) { - tgN = -(-ccl_fetch(sd, I) - ccl_fetch(sd, dPdu) * (dot(ccl_fetch(sd, dPdu),-ccl_fetch(sd, I)) / len_squared(ccl_fetch(sd, dPdu)))); + tgN = -(-sd->I - sd->dPdu * (dot(sd->dPdu,-sd->I) / len_squared(sd->dPdu))); tgN = normalize(tgN); /* need to find suitable scaled gd for corrected normal */ #if 0 - tgN = normalize(tgN - gd * ccl_fetch(sd, dPdu)); + tgN = normalize(tgN - gd * sd->dPdu); #endif } @@ -213,817 +214,6 @@ ccl_device_inline void curvebounds(float *lower, float *upper, float *extremta, } } -#ifdef __KERNEL_SSE2__ -ccl_device_inline ssef transform_point_T3(const ssef t[3], const ssef &a) -{ - return madd(shuffle<0>(a), t[0], madd(shuffle<1>(a), t[1], shuffle<2>(a) * t[2])); -} -#endif - -#ifdef __KERNEL_SSE2__ -/* Pass P and dir by reference to aligned vector */ -ccl_device_forceinline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersection *isect, - const float3 &P, const float3 &dir, uint visibility, int object, int curveAddr, float time, int type, uint *lcg_state, float difl, float extmax) -#else -ccl_device_forceinline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersection *isect, - float3 P, float3 dir, uint visibility, int object, int curveAddr, float time,int type, uint *lcg_state, float difl, float extmax) -#endif -{ - int segment = PRIMITIVE_UNPACK_SEGMENT(type); - float epsilon = 0.0f; - float r_st, r_en; - - int depth = 
kernel_data.curve.subdivisions; - int flags = kernel_data.curve.curveflags; - int prim = kernel_tex_fetch(__prim_index, curveAddr); - -#ifdef __KERNEL_SSE2__ - ssef vdir = load4f(dir); - ssef vcurve_coef[4]; - const float3 *curve_coef = (float3 *)vcurve_coef; - - { - ssef dtmp = vdir * vdir; - ssef d_ss = mm_sqrt(dtmp + shuffle<2>(dtmp)); - ssef rd_ss = load1f_first(1.0f) / d_ss; - - ssei v00vec = load4i((ssei *)&kg->__curves.data[prim]); - int2 &v00 = (int2 &)v00vec; - - int k0 = v00.x + segment; - int k1 = k0 + 1; - int ka = max(k0 - 1, v00.x); - int kb = min(k1 + 1, v00.x + v00.y - 1); - - ssef P_curve[4]; - - if(type & PRIMITIVE_CURVE) { - P_curve[0] = load4f(&kg->__curve_keys.data[ka].x); - P_curve[1] = load4f(&kg->__curve_keys.data[k0].x); - P_curve[2] = load4f(&kg->__curve_keys.data[k1].x); - P_curve[3] = load4f(&kg->__curve_keys.data[kb].x); - } - else { - int fobject = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, curveAddr): object; - motion_cardinal_curve_keys(kg, fobject, prim, time, ka, k0, k1, kb, (float4*)&P_curve); - } - - ssef rd_sgn = set_sign_bit<0, 1, 1, 1>(shuffle<0>(rd_ss)); - ssef mul_zxxy = shuffle<2, 0, 0, 1>(vdir) * rd_sgn; - ssef mul_yz = shuffle<1, 2, 1, 2>(vdir) * mul_zxxy; - ssef mul_shuf = shuffle<0, 1, 2, 3>(mul_zxxy, mul_yz); - ssef vdir0 = vdir & cast(ssei(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0)); - - ssef htfm0 = shuffle<0, 2, 0, 3>(mul_shuf, vdir0); - ssef htfm1 = shuffle<1, 0, 1, 3>(load1f_first(extract<0>(d_ss)), vdir0); - ssef htfm2 = shuffle<1, 3, 2, 3>(mul_shuf, vdir0); - - ssef htfm[] = { htfm0, htfm1, htfm2 }; - ssef vP = load4f(P); - ssef p0 = transform_point_T3(htfm, P_curve[0] - vP); - ssef p1 = transform_point_T3(htfm, P_curve[1] - vP); - ssef p2 = transform_point_T3(htfm, P_curve[2] - vP); - ssef p3 = transform_point_T3(htfm, P_curve[3] - vP); - - float fc = 0.71f; - ssef vfc = ssef(fc); - ssef vfcxp3 = vfc * p3; - - vcurve_coef[0] = p1; - vcurve_coef[1] = vfc * (p2 - p0); - vcurve_coef[2] = madd(ssef(fc * 
2.0f), p0, madd(ssef(fc - 3.0f), p1, msub(ssef(3.0f - 2.0f * fc), p2, vfcxp3))); - vcurve_coef[3] = msub(ssef(fc - 2.0f), p2 - p1, msub(vfc, p0, vfcxp3)); - - r_st = ((float4 &)P_curve[1]).w; - r_en = ((float4 &)P_curve[2]).w; - } -#else - float3 curve_coef[4]; - - /* curve Intersection check */ - /* obtain curve parameters */ - { - /* ray transform created - this should be created at beginning of intersection loop */ - Transform htfm; - float d = sqrtf(dir.x * dir.x + dir.z * dir.z); - htfm = make_transform( - dir.z / d, 0, -dir.x /d, 0, - -dir.x * dir.y /d, d, -dir.y * dir.z /d, 0, - dir.x, dir.y, dir.z, 0, - 0, 0, 0, 1); - - float4 v00 = kernel_tex_fetch(__curves, prim); - - int k0 = __float_as_int(v00.x) + segment; - int k1 = k0 + 1; - - int ka = max(k0 - 1,__float_as_int(v00.x)); - int kb = min(k1 + 1,__float_as_int(v00.x) + __float_as_int(v00.y) - 1); - - float4 P_curve[4]; - - if(type & PRIMITIVE_CURVE) { - P_curve[0] = kernel_tex_fetch(__curve_keys, ka); - P_curve[1] = kernel_tex_fetch(__curve_keys, k0); - P_curve[2] = kernel_tex_fetch(__curve_keys, k1); - P_curve[3] = kernel_tex_fetch(__curve_keys, kb); - } - else { - int fobject = (object == OBJECT_NONE)? 
kernel_tex_fetch(__prim_object, curveAddr): object; - motion_cardinal_curve_keys(kg, fobject, prim, time, ka, k0, k1, kb, P_curve); - } - - float3 p0 = transform_point(&htfm, float4_to_float3(P_curve[0]) - P); - float3 p1 = transform_point(&htfm, float4_to_float3(P_curve[1]) - P); - float3 p2 = transform_point(&htfm, float4_to_float3(P_curve[2]) - P); - float3 p3 = transform_point(&htfm, float4_to_float3(P_curve[3]) - P); - - float fc = 0.71f; - curve_coef[0] = p1; - curve_coef[1] = -fc*p0 + fc*p2; - curve_coef[2] = 2.0f * fc * p0 + (fc - 3.0f) * p1 + (3.0f - 2.0f * fc) * p2 - fc * p3; - curve_coef[3] = -fc * p0 + (2.0f - fc) * p1 + (fc - 2.0f) * p2 + fc * p3; - r_st = P_curve[1].w; - r_en = P_curve[2].w; - } -#endif - - float r_curr = max(r_st, r_en); - - if((flags & CURVE_KN_RIBBONS) || !(flags & CURVE_KN_BACKFACING)) - epsilon = 2 * r_curr; - - /* find bounds - this is slow for cubic curves */ - float upper, lower; - - float zextrem[4]; - curvebounds(&lower, &upper, &zextrem[0], &zextrem[1], &zextrem[2], &zextrem[3], curve_coef[0].z, curve_coef[1].z, curve_coef[2].z, curve_coef[3].z); - if(lower - r_curr > isect->t || upper + r_curr < epsilon) - return false; - - /* minimum width extension */ - float mw_extension = min(difl * fabsf(upper), extmax); - float r_ext = mw_extension + r_curr; - - float xextrem[4]; - curvebounds(&lower, &upper, &xextrem[0], &xextrem[1], &xextrem[2], &xextrem[3], curve_coef[0].x, curve_coef[1].x, curve_coef[2].x, curve_coef[3].x); - if(lower > r_ext || upper < -r_ext) - return false; - - float yextrem[4]; - curvebounds(&lower, &upper, &yextrem[0], &yextrem[1], &yextrem[2], &yextrem[3], curve_coef[0].y, curve_coef[1].y, curve_coef[2].y, curve_coef[3].y); - if(lower > r_ext || upper < -r_ext) - return false; - - /* setup recurrent loop */ - int level = 1 << depth; - int tree = 0; - float resol = 1.0f / (float)level; - bool hit = false; - - /* begin loop */ - while(!(tree >> (depth))) { - float i_st = tree * resol; - float i_en = i_st + 
(level * resol); -#ifdef __KERNEL_SSE2__ - ssef vi_st = ssef(i_st), vi_en = ssef(i_en); - ssef vp_st = madd(madd(madd(vcurve_coef[3], vi_st, vcurve_coef[2]), vi_st, vcurve_coef[1]), vi_st, vcurve_coef[0]); - ssef vp_en = madd(madd(madd(vcurve_coef[3], vi_en, vcurve_coef[2]), vi_en, vcurve_coef[1]), vi_en, vcurve_coef[0]); - - ssef vbmin = min(vp_st, vp_en); - ssef vbmax = max(vp_st, vp_en); - - float3 &bmin = (float3 &)vbmin, &bmax = (float3 &)vbmax; - float &bminx = bmin.x, &bminy = bmin.y, &bminz = bmin.z; - float &bmaxx = bmax.x, &bmaxy = bmax.y, &bmaxz = bmax.z; - float3 &p_st = (float3 &)vp_st, &p_en = (float3 &)vp_en; -#else - float3 p_st = ((curve_coef[3] * i_st + curve_coef[2]) * i_st + curve_coef[1]) * i_st + curve_coef[0]; - float3 p_en = ((curve_coef[3] * i_en + curve_coef[2]) * i_en + curve_coef[1]) * i_en + curve_coef[0]; - - float bminx = min(p_st.x, p_en.x); - float bmaxx = max(p_st.x, p_en.x); - float bminy = min(p_st.y, p_en.y); - float bmaxy = max(p_st.y, p_en.y); - float bminz = min(p_st.z, p_en.z); - float bmaxz = max(p_st.z, p_en.z); -#endif - - if(xextrem[0] >= i_st && xextrem[0] <= i_en) { - bminx = min(bminx,xextrem[1]); - bmaxx = max(bmaxx,xextrem[1]); - } - if(xextrem[2] >= i_st && xextrem[2] <= i_en) { - bminx = min(bminx,xextrem[3]); - bmaxx = max(bmaxx,xextrem[3]); - } - if(yextrem[0] >= i_st && yextrem[0] <= i_en) { - bminy = min(bminy,yextrem[1]); - bmaxy = max(bmaxy,yextrem[1]); - } - if(yextrem[2] >= i_st && yextrem[2] <= i_en) { - bminy = min(bminy,yextrem[3]); - bmaxy = max(bmaxy,yextrem[3]); - } - if(zextrem[0] >= i_st && zextrem[0] <= i_en) { - bminz = min(bminz,zextrem[1]); - bmaxz = max(bmaxz,zextrem[1]); - } - if(zextrem[2] >= i_st && zextrem[2] <= i_en) { - bminz = min(bminz,zextrem[3]); - bmaxz = max(bmaxz,zextrem[3]); - } - - float r1 = r_st + (r_en - r_st) * i_st; - float r2 = r_st + (r_en - r_st) * i_en; - r_curr = max(r1, r2); - - mw_extension = min(difl * fabsf(bmaxz), extmax); - float r_ext = mw_extension + r_curr; - 
float coverage = 1.0f; - - if(bminz - r_curr > isect->t || bmaxz + r_curr < epsilon || bminx > r_ext|| bmaxx < -r_ext|| bminy > r_ext|| bmaxy < -r_ext) { - /* the bounding box does not overlap the square centered at O */ - tree += level; - level = tree & -tree; - } - else if(level == 1) { - - /* the maximum recursion depth is reached. - * check if dP0.(Q-P0)>=0 and dPn.(Pn-Q)>=0. - * dP* is reversed if necessary.*/ - float t = isect->t; - float u = 0.0f; - float gd = 0.0f; - - if(flags & CURVE_KN_RIBBONS) { - float3 tg = (p_en - p_st); - float w = tg.x * tg.x + tg.y * tg.y; - if(w == 0) { - tree++; - level = tree & -tree; - continue; - } - w = -(p_st.x * tg.x + p_st.y * tg.y) / w; - w = saturate(w); - - /* compute u on the curve segment */ - u = i_st * (1 - w) + i_en * w; - r_curr = r_st + (r_en - r_st) * u; - /* compare x-y distances */ - float3 p_curr = ((curve_coef[3] * u + curve_coef[2]) * u + curve_coef[1]) * u + curve_coef[0]; - - float3 dp_st = (3 * curve_coef[3] * i_st + 2 * curve_coef[2]) * i_st + curve_coef[1]; - if(dot(tg, dp_st)< 0) - dp_st *= -1; - if(dot(dp_st, -p_st) + p_curr.z * dp_st.z < 0) { - tree++; - level = tree & -tree; - continue; - } - float3 dp_en = (3 * curve_coef[3] * i_en + 2 * curve_coef[2]) * i_en + curve_coef[1]; - if(dot(tg, dp_en) < 0) - dp_en *= -1; - if(dot(dp_en, p_en) - p_curr.z * dp_en.z < 0) { - tree++; - level = tree & -tree; - continue; - } - - /* compute coverage */ - float r_ext = r_curr; - coverage = 1.0f; - if(difl != 0.0f) { - mw_extension = min(difl * fabsf(bmaxz), extmax); - r_ext = mw_extension + r_curr; - float d = sqrtf(p_curr.x * p_curr.x + p_curr.y * p_curr.y); - float d0 = d - r_curr; - float d1 = d + r_curr; - float inv_mw_extension = 1.0f/mw_extension; - if(d0 >= 0) - coverage = (min(d1 * inv_mw_extension, 1.0f) - min(d0 * inv_mw_extension, 1.0f)) * 0.5f; - else // inside - coverage = (min(d1 * inv_mw_extension, 1.0f) + min(-d0 * inv_mw_extension, 1.0f)) * 0.5f; - } - - if(p_curr.x * p_curr.x + p_curr.y * 
p_curr.y >= r_ext * r_ext || p_curr.z <= epsilon || isect->t < p_curr.z) { - tree++; - level = tree & -tree; - continue; - } - - t = p_curr.z; - - /* stochastic fade from minimum width */ - if(difl != 0.0f && lcg_state) { - if(coverage != 1.0f && (lcg_step_float(lcg_state) > coverage)) - return hit; - } - } - else { - float l = len(p_en - p_st); - /* minimum width extension */ - float or1 = r1; - float or2 = r2; - - if(difl != 0.0f) { - mw_extension = min(len(p_st - P) * difl, extmax); - or1 = r1 < mw_extension ? mw_extension : r1; - mw_extension = min(len(p_en - P) * difl, extmax); - or2 = r2 < mw_extension ? mw_extension : r2; - } - /* --- */ - float invl = 1.0f/l; - float3 tg = (p_en - p_st) * invl; - gd = (or2 - or1) * invl; - float difz = -dot(p_st,tg); - float cyla = 1.0f - (tg.z * tg.z * (1 + gd*gd)); - float invcyla = 1.0f/cyla; - float halfb = (-p_st.z - tg.z*(difz + gd*(difz*gd + or1))); - float tcentre = -halfb*invcyla; - float zcentre = difz + (tg.z * tcentre); - float3 tdif = - p_st; - tdif.z += tcentre; - float tdifz = dot(tdif,tg); - float tb = 2*(tdif.z - tg.z*(tdifz + gd*(tdifz*gd + or1))); - float tc = dot(tdif,tdif) - tdifz * tdifz * (1 + gd*gd) - or1*or1 - 2*or1*tdifz*gd; - float td = tb*tb - 4*cyla*tc; - if(td < 0.0f) { - tree++; - level = tree & -tree; - continue; - } - - float rootd = sqrtf(td); - float correction = (-tb - rootd) * 0.5f * invcyla; - t = tcentre + correction; - - float3 dp_st = (3 * curve_coef[3] * i_st + 2 * curve_coef[2]) * i_st + curve_coef[1]; - if(dot(tg, dp_st)< 0) - dp_st *= -1; - float3 dp_en = (3 * curve_coef[3] * i_en + 2 * curve_coef[2]) * i_en + curve_coef[1]; - if(dot(tg, dp_en) < 0) - dp_en *= -1; - - if(flags & CURVE_KN_BACKFACING && (dot(dp_st, -p_st) + t * dp_st.z < 0 || dot(dp_en, p_en) - t * dp_en.z < 0 || isect->t < t || t <= 0.0f)) { - correction = (-tb + rootd) * 0.5f * invcyla; - t = tcentre + correction; - } - - if(dot(dp_st, -p_st) + t * dp_st.z < 0 || dot(dp_en, p_en) - t * dp_en.z < 0 || isect->t < t 
|| t <= 0.0f) { - tree++; - level = tree & -tree; - continue; - } - - float w = (zcentre + (tg.z * correction)) * invl; - w = saturate(w); - /* compute u on the curve segment */ - u = i_st * (1 - w) + i_en * w; - - /* stochastic fade from minimum width */ - if(difl != 0.0f && lcg_state) { - r_curr = r1 + (r2 - r1) * w; - r_ext = or1 + (or2 - or1) * w; - coverage = r_curr/r_ext; - - if(coverage != 1.0f && (lcg_step_float(lcg_state) > coverage)) - return hit; - } - } - /* we found a new intersection */ - -#ifdef __VISIBILITY_FLAG__ - /* visibility flag test. we do it here under the assumption - * that most triangles are culled by node flags */ - if(kernel_tex_fetch(__prim_visibility, curveAddr) & visibility) -#endif - { - /* record intersection */ - isect->t = t; - isect->u = u; - isect->v = gd; - isect->prim = curveAddr; - isect->object = object; - isect->type = type; - hit = true; - } - - tree++; - level = tree & -tree; - } - else { - /* split the curve into two curves and process */ - level = level >> 1; - } - } - - return hit; -} - -ccl_device_forceinline bool bvh_curve_intersect(KernelGlobals *kg, Intersection *isect, - float3 P, float3 direction, uint visibility, int object, int curveAddr, float time, int type, uint *lcg_state, float difl, float extmax) -{ - /* define few macros to minimize code duplication for SSE */ -#ifndef __KERNEL_SSE2__ -# define len3_squared(x) len_squared(x) -# define len3(x) len(x) -# define dot3(x, y) dot(x, y) -#endif - - int segment = PRIMITIVE_UNPACK_SEGMENT(type); - /* curve Intersection check */ - int flags = kernel_data.curve.curveflags; - - int prim = kernel_tex_fetch(__prim_index, curveAddr); - float4 v00 = kernel_tex_fetch(__curves, prim); - - int cnum = __float_as_int(v00.x); - int k0 = cnum + segment; - int k1 = k0 + 1; - -#ifndef __KERNEL_SSE2__ - float4 P_curve[2]; - - if(type & PRIMITIVE_CURVE) { - P_curve[0] = kernel_tex_fetch(__curve_keys, k0); - P_curve[1] = kernel_tex_fetch(__curve_keys, k1); - } - else { - int 
fobject = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, curveAddr): object; - motion_curve_keys(kg, fobject, prim, time, k0, k1, P_curve); - } - - float or1 = P_curve[0].w; - float or2 = P_curve[1].w; - float3 p1 = float4_to_float3(P_curve[0]); - float3 p2 = float4_to_float3(P_curve[1]); - - /* minimum width extension */ - float r1 = or1; - float r2 = or2; - float3 dif = P - p1; - float3 dif_second = P - p2; - if(difl != 0.0f) { - float pixelsize = min(len3(dif) * difl, extmax); - r1 = or1 < pixelsize ? pixelsize : or1; - pixelsize = min(len3(dif_second) * difl, extmax); - r2 = or2 < pixelsize ? pixelsize : or2; - } - /* --- */ - - float3 p21_diff = p2 - p1; - float3 sphere_dif1 = (dif + dif_second) * 0.5f; - float3 dir = direction; - float sphere_b_tmp = dot3(dir, sphere_dif1); - float3 sphere_dif2 = sphere_dif1 - sphere_b_tmp * dir; -#else - ssef P_curve[2]; - - if(type & PRIMITIVE_CURVE) { - P_curve[0] = load4f(&kg->__curve_keys.data[k0].x); - P_curve[1] = load4f(&kg->__curve_keys.data[k1].x); - } - else { - int fobject = (object == OBJECT_NONE)? 
kernel_tex_fetch(__prim_object, curveAddr): object; - motion_curve_keys(kg, fobject, prim, time, k0, k1, (float4*)&P_curve); - } - - const ssef or12 = shuffle<3, 3, 3, 3>(P_curve[0], P_curve[1]); - - ssef r12 = or12; - const ssef vP = load4f(P); - const ssef dif = vP - P_curve[0]; - const ssef dif_second = vP - P_curve[1]; - if(difl != 0.0f) { - const ssef len1_sq = len3_squared_splat(dif); - const ssef len2_sq = len3_squared_splat(dif_second); - const ssef len12 = mm_sqrt(shuffle<0, 0, 0, 0>(len1_sq, len2_sq)); - const ssef pixelsize12 = min(len12 * difl, ssef(extmax)); - r12 = max(or12, pixelsize12); - } - float or1 = extract<0>(or12), or2 = extract<0>(shuffle<2>(or12)); - float r1 = extract<0>(r12), r2 = extract<0>(shuffle<2>(r12)); - - const ssef p21_diff = P_curve[1] - P_curve[0]; - const ssef sphere_dif1 = (dif + dif_second) * 0.5f; - const ssef dir = load4f(direction); - const ssef sphere_b_tmp = dot3_splat(dir, sphere_dif1); - const ssef sphere_dif2 = nmadd(sphere_b_tmp, dir, sphere_dif1); -#endif - - float mr = max(r1, r2); - float l = len3(p21_diff); - float invl = 1.0f / l; - float sp_r = mr + 0.5f * l; - - float sphere_b = dot3(dir, sphere_dif2); - float sdisc = sphere_b * sphere_b - len3_squared(sphere_dif2) + sp_r * sp_r; - - if(sdisc < 0.0f) - return false; - - /* obtain parameters and test midpoint distance for suitable modes */ -#ifndef __KERNEL_SSE2__ - float3 tg = p21_diff * invl; -#else - const ssef tg = p21_diff * invl; -#endif - float gd = (r2 - r1) * invl; - - float dirz = dot3(dir, tg); - float difz = dot3(dif, tg); - - float a = 1.0f - (dirz*dirz*(1 + gd*gd)); - - float halfb = dot3(dir, dif) - dirz*(difz + gd*(difz*gd + r1)); - - float tcentre = -halfb/a; - float zcentre = difz + (dirz * tcentre); - - if((tcentre > isect->t) && !(flags & CURVE_KN_ACCURATE)) - return false; - if((zcentre < 0 || zcentre > l) && !(flags & CURVE_KN_ACCURATE) && !(flags & CURVE_KN_INTERSECTCORRECTION)) - return false; - - /* test minimum separation */ -#ifndef 
__KERNEL_SSE2__ - float3 cprod = cross(tg, dir); - float cprod2sq = len3_squared(cross(tg, dif)); -#else - const ssef cprod = cross(tg, dir); - float cprod2sq = len3_squared(cross_zxy(tg, dif)); -#endif - float cprodsq = len3_squared(cprod); - float distscaled = dot3(cprod, dif); - - if(cprodsq == 0) - distscaled = cprod2sq; - else - distscaled = (distscaled*distscaled)/cprodsq; - - if(distscaled > mr*mr) - return false; - - /* calculate true intersection */ -#ifndef __KERNEL_SSE2__ - float3 tdif = dif + tcentre * dir; -#else - const ssef tdif = madd(ssef(tcentre), dir, dif); -#endif - float tdifz = dot3(tdif, tg); - float tdifma = tdifz*gd + r1; - float tb = 2*(dot3(dir, tdif) - dirz*(tdifz + gd*tdifma)); - float tc = dot3(tdif, tdif) - tdifz*tdifz - tdifma*tdifma; - float td = tb*tb - 4*a*tc; - - if(td < 0.0f) - return false; - - float rootd = 0.0f; - float correction = 0.0f; - if(flags & CURVE_KN_ACCURATE) { - rootd = sqrtf(td); - correction = ((-tb - rootd)/(2*a)); - } - - float t = tcentre + correction; - - if(t < isect->t) { - - if(flags & CURVE_KN_INTERSECTCORRECTION) { - rootd = sqrtf(td); - correction = ((-tb - rootd)/(2*a)); - t = tcentre + correction; - } - - float z = zcentre + (dirz * correction); - // bool backface = false; - - if(flags & CURVE_KN_BACKFACING && (t < 0.0f || z < 0 || z > l)) { - // backface = true; - correction = ((-tb + rootd)/(2*a)); - t = tcentre + correction; - z = zcentre + (dirz * correction); - } - - /* stochastic fade from minimum width */ - float adjradius = or1 + z * (or2 - or1) * invl; - adjradius = adjradius / (r1 + z * gd); - if(lcg_state && adjradius != 1.0f) { - if(lcg_step_float(lcg_state) > adjradius) - return false; - } - /* --- */ - - if(t > 0.0f && t < isect->t && z >= 0 && z <= l) { - - if(flags & CURVE_KN_ENCLOSEFILTER) { - float enc_ratio = 1.01f; - if((difz > -r1 * enc_ratio) && (dot3(dif_second, tg) < r2 * enc_ratio)) { - float a2 = 1.0f - (dirz*dirz*(1 + gd*gd*enc_ratio*enc_ratio)); - float c2 = dot3(dif, dif) 
- difz * difz * (1 + gd*gd*enc_ratio*enc_ratio) - r1*r1*enc_ratio*enc_ratio - 2*r1*difz*gd*enc_ratio; - if(a2*c2 < 0.0f) - return false; - } - } - -#ifdef __VISIBILITY_FLAG__ - /* visibility flag test. we do it here under the assumption - * that most triangles are culled by node flags */ - if(kernel_tex_fetch(__prim_visibility, curveAddr) & visibility) -#endif - { - /* record intersection */ - isect->t = t; - isect->u = z*invl; - isect->v = gd; - isect->prim = curveAddr; - isect->object = object; - isect->type = type; - - return true; - } - } - } - - return false; - -#ifndef __KERNEL_SSE2__ -# undef len3_squared -# undef len3 -# undef dot3 -# endif -} - -ccl_device_inline float3 curvetangent(float t, float3 p0, float3 p1, float3 p2, float3 p3) -{ - float fc = 0.71f; - float data[4]; - float t2 = t * t; - data[0] = -3.0f * fc * t2 + 4.0f * fc * t - fc; - data[1] = 3.0f * (2.0f - fc) * t2 + 2.0f * (fc - 3.0f) * t; - data[2] = 3.0f * (fc - 2.0f) * t2 + 2.0f * (3.0f - 2.0f * fc) * t + fc; - data[3] = 3.0f * fc * t2 - 2.0f * fc * t; - return data[0] * p0 + data[1] * p1 + data[2] * p2 + data[3] * p3; -} - -ccl_device_inline float3 curvepoint(float t, float3 p0, float3 p1, float3 p2, float3 p3) -{ - float data[4]; - float fc = 0.71f; - float t2 = t * t; - float t3 = t2 * t; - data[0] = -fc * t3 + 2.0f * fc * t2 - fc * t; - data[1] = (2.0f - fc) * t3 + (fc - 3.0f) * t2 + 1.0f; - data[2] = (fc - 2.0f) * t3 + (3.0f - 2.0f * fc) * t2 + fc * t; - data[3] = fc * t3 - fc * t2; - return data[0] * p0 + data[1] * p1 + data[2] * p2 + data[3] * p3; -} - -ccl_device_inline float3 bvh_curve_refine(KernelGlobals *kg, ShaderData *sd, const Intersection *isect, const Ray *ray) -{ - int flag = kernel_data.curve.curveflags; - float t = isect->t; - float3 P = ray->P; - float3 D = ray->D; - - if(isect->object != OBJECT_NONE) { -#ifdef __OBJECT_MOTION__ - Transform tfm = ccl_fetch(sd, ob_itfm); -#else - Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_INVERSE_TRANSFORM); 
-#endif - - P = transform_point(&tfm, P); - D = transform_direction(&tfm, D*t); - D = normalize_len(D, &t); - } - - int prim = kernel_tex_fetch(__prim_index, isect->prim); - float4 v00 = kernel_tex_fetch(__curves, prim); - - int k0 = __float_as_int(v00.x) + PRIMITIVE_UNPACK_SEGMENT(ccl_fetch(sd, type)); - int k1 = k0 + 1; - - float3 tg; - - if(flag & CURVE_KN_INTERPOLATE) { - int ka = max(k0 - 1,__float_as_int(v00.x)); - int kb = min(k1 + 1,__float_as_int(v00.x) + __float_as_int(v00.y) - 1); - - float4 P_curve[4]; - - if(ccl_fetch(sd, type) & PRIMITIVE_CURVE) { - P_curve[0] = kernel_tex_fetch(__curve_keys, ka); - P_curve[1] = kernel_tex_fetch(__curve_keys, k0); - P_curve[2] = kernel_tex_fetch(__curve_keys, k1); - P_curve[3] = kernel_tex_fetch(__curve_keys, kb); - } - else { - motion_cardinal_curve_keys(kg, ccl_fetch(sd, object), ccl_fetch(sd, prim), ccl_fetch(sd, time), ka, k0, k1, kb, P_curve); - } - - float3 p[4]; - p[0] = float4_to_float3(P_curve[0]); - p[1] = float4_to_float3(P_curve[1]); - p[2] = float4_to_float3(P_curve[2]); - p[3] = float4_to_float3(P_curve[3]); - - P = P + D*t; - -#ifdef __UV__ - ccl_fetch(sd, u) = isect->u; - ccl_fetch(sd, v) = 0.0f; -#endif - - tg = normalize(curvetangent(isect->u, p[0], p[1], p[2], p[3])); - - if(kernel_data.curve.curveflags & CURVE_KN_RIBBONS) { - ccl_fetch(sd, Ng) = normalize(-(D - tg * (dot(tg, D)))); - } - else { - /* direction from inside to surface of curve */ - float3 p_curr = curvepoint(isect->u, p[0], p[1], p[2], p[3]); - ccl_fetch(sd, Ng) = normalize(P - p_curr); - - /* adjustment for changing radius */ - float gd = isect->v; - - if(gd != 0.0f) { - ccl_fetch(sd, Ng) = ccl_fetch(sd, Ng) - gd * tg; - ccl_fetch(sd, Ng) = normalize(ccl_fetch(sd, Ng)); - } - } - - /* todo: sometimes the normal is still so that this is detected as - * backfacing even if cull backfaces is enabled */ - - ccl_fetch(sd, N) = ccl_fetch(sd, Ng); - } - else { - float4 P_curve[2]; - - if(ccl_fetch(sd, type) & PRIMITIVE_CURVE) { - P_curve[0]= 
kernel_tex_fetch(__curve_keys, k0); - P_curve[1]= kernel_tex_fetch(__curve_keys, k1); - } - else { - motion_curve_keys(kg, ccl_fetch(sd, object), ccl_fetch(sd, prim), ccl_fetch(sd, time), k0, k1, P_curve); - } - - float l = 1.0f; - tg = normalize_len(float4_to_float3(P_curve[1] - P_curve[0]), &l); - - P = P + D*t; - - float3 dif = P - float4_to_float3(P_curve[0]); - -#ifdef __UV__ - ccl_fetch(sd, u) = dot(dif,tg)/l; - ccl_fetch(sd, v) = 0.0f; -#endif - - if(flag & CURVE_KN_TRUETANGENTGNORMAL) { - ccl_fetch(sd, Ng) = -(D - tg * dot(tg, D)); - ccl_fetch(sd, Ng) = normalize(ccl_fetch(sd, Ng)); - } - else { - float gd = isect->v; - - /* direction from inside to surface of curve */ - ccl_fetch(sd, Ng) = (dif - tg * ccl_fetch(sd, u) * l) / (P_curve[0].w + ccl_fetch(sd, u) * l * gd); - - /* adjustment for changing radius */ - if(gd != 0.0f) { - ccl_fetch(sd, Ng) = ccl_fetch(sd, Ng) - gd * tg; - ccl_fetch(sd, Ng) = normalize(ccl_fetch(sd, Ng)); - } - } - - ccl_fetch(sd, N) = ccl_fetch(sd, Ng); - } - -#ifdef __DPDU__ - /* dPdu/dPdv */ - ccl_fetch(sd, dPdu) = tg; - ccl_fetch(sd, dPdv) = cross(tg, ccl_fetch(sd, Ng)); -#endif - - if(isect->object != OBJECT_NONE) { -#ifdef __OBJECT_MOTION__ - Transform tfm = ccl_fetch(sd, ob_tfm); -#else - Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_TRANSFORM); -#endif - - P = transform_point(&tfm, P); - } - - return P; -} - -#endif +#endif /* __HAIR__ */ CCL_NAMESPACE_END - diff --git a/intern/cycles/kernel/geom/geom_curve_intersect.h b/intern/cycles/kernel/geom/geom_curve_intersect.h new file mode 100644 index 00000000000..e9a149ea1ab --- /dev/null +++ b/intern/cycles/kernel/geom/geom_curve_intersect.h @@ -0,0 +1,934 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +CCL_NAMESPACE_BEGIN + +/* Curve primitive intersection functions. */ + +#ifdef __HAIR__ + +#if defined(__KERNEL_CUDA__) && (__CUDA_ARCH__ < 300) +# define ccl_device_curveintersect ccl_device +#else +# define ccl_device_curveintersect ccl_device_forceinline +#endif + +#ifdef __KERNEL_SSE2__ +ccl_device_inline ssef transform_point_T3(const ssef t[3], const ssef &a) +{ + return madd(shuffle<0>(a), t[0], madd(shuffle<1>(a), t[1], shuffle<2>(a) * t[2])); +} +#endif + +/* On CPU pass P and dir by reference to aligned vector. */ +ccl_device_curveintersect bool cardinal_curve_intersect( + KernelGlobals *kg, + Intersection *isect, + const float3 ccl_ref P, + const float3 ccl_ref dir, + uint visibility, + int object, + int curveAddr, + float time, + int type, + uint *lcg_state, + float difl, + float extmax) +{ + const bool is_curve_primitive = (type & PRIMITIVE_CURVE); + + if(!is_curve_primitive && kernel_data.bvh.use_bvh_steps) { + const float2 prim_time = kernel_tex_fetch(__prim_time, curveAddr); + if(time < prim_time.x || time > prim_time.y) { + return false; + } + } + + int segment = PRIMITIVE_UNPACK_SEGMENT(type); + float epsilon = 0.0f; + float r_st, r_en; + + int depth = kernel_data.curve.subdivisions; + int flags = kernel_data.curve.curveflags; + int prim = kernel_tex_fetch(__prim_index, curveAddr); + +#ifdef __KERNEL_SSE2__ + ssef vdir = load4f(dir); + ssef vcurve_coef[4]; + const float3 *curve_coef = (float3 *)vcurve_coef; + + { + ssef dtmp = vdir * vdir; + ssef d_ss = mm_sqrt(dtmp + shuffle<2>(dtmp)); + ssef rd_ss = load1f_first(1.0f) / 
d_ss; + + ssei v00vec = load4i((ssei *)&kg->__curves.data[prim]); + int2 &v00 = (int2 &)v00vec; + + int k0 = v00.x + segment; + int k1 = k0 + 1; + int ka = max(k0 - 1, v00.x); + int kb = min(k1 + 1, v00.x + v00.y - 1); + +#if defined(__KERNEL_AVX2__) && defined(__KERNEL_SSE__) && (!defined(_MSC_VER) || _MSC_VER > 1800) + avxf P_curve_0_1, P_curve_2_3; + if(is_curve_primitive) { + P_curve_0_1 = _mm256_loadu2_m128(&kg->__curve_keys.data[k0].x, &kg->__curve_keys.data[ka].x); + P_curve_2_3 = _mm256_loadu2_m128(&kg->__curve_keys.data[kb].x, &kg->__curve_keys.data[k1].x); + } + else { + int fobject = (object == OBJECT_NONE) ? kernel_tex_fetch(__prim_object, curveAddr) : object; + motion_cardinal_curve_keys_avx(kg, fobject, prim, time, ka, k0, k1, kb, &P_curve_0_1,&P_curve_2_3); + } +#else /* __KERNEL_AVX2__ */ + ssef P_curve[4]; + + if(is_curve_primitive) { + P_curve[0] = load4f(&kg->__curve_keys.data[ka].x); + P_curve[1] = load4f(&kg->__curve_keys.data[k0].x); + P_curve[2] = load4f(&kg->__curve_keys.data[k1].x); + P_curve[3] = load4f(&kg->__curve_keys.data[kb].x); + } + else { + int fobject = (object == OBJECT_NONE)? 
kernel_tex_fetch(__prim_object, curveAddr): object; + motion_cardinal_curve_keys(kg, fobject, prim, time, ka, k0, k1, kb, (float4*)&P_curve); + } +#endif /* __KERNEL_AVX2__ */ + + ssef rd_sgn = set_sign_bit<0, 1, 1, 1>(shuffle<0>(rd_ss)); + ssef mul_zxxy = shuffle<2, 0, 0, 1>(vdir) * rd_sgn; + ssef mul_yz = shuffle<1, 2, 1, 2>(vdir) * mul_zxxy; + ssef mul_shuf = shuffle<0, 1, 2, 3>(mul_zxxy, mul_yz); + ssef vdir0 = vdir & cast(ssei(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0)); + + ssef htfm0 = shuffle<0, 2, 0, 3>(mul_shuf, vdir0); + ssef htfm1 = shuffle<1, 0, 1, 3>(load1f_first(extract<0>(d_ss)), vdir0); + ssef htfm2 = shuffle<1, 3, 2, 3>(mul_shuf, vdir0); + +#if defined(__KERNEL_AVX2__) && defined(__KERNEL_SSE__) && (!defined(_MSC_VER) || _MSC_VER > 1800) + const avxf vPP = _mm256_broadcast_ps(&P.m128); + const avxf htfm00 = avxf(htfm0.m128, htfm0.m128); + const avxf htfm11 = avxf(htfm1.m128, htfm1.m128); + const avxf htfm22 = avxf(htfm2.m128, htfm2.m128); + + const avxf p01 = madd(shuffle<0>(P_curve_0_1 - vPP), + htfm00, + madd(shuffle<1>(P_curve_0_1 - vPP), + htfm11, + shuffle<2>(P_curve_0_1 - vPP) * htfm22)); + const avxf p23 = madd(shuffle<0>(P_curve_2_3 - vPP), + htfm00, + madd(shuffle<1>(P_curve_2_3 - vPP), + htfm11, + shuffle<2>(P_curve_2_3 - vPP)*htfm22)); + + const ssef p0 = _mm256_castps256_ps128(p01); + const ssef p1 = _mm256_extractf128_ps(p01, 1); + const ssef p2 = _mm256_castps256_ps128(p23); + const ssef p3 = _mm256_extractf128_ps(p23, 1); + + const ssef P_curve_1 = _mm256_extractf128_ps(P_curve_0_1, 1); + r_st = ((float4 &)P_curve_1).w; + const ssef P_curve_2 = _mm256_castps256_ps128(P_curve_2_3); + r_en = ((float4 &)P_curve_2).w; +#else /* __KERNEL_AVX2__ */ + ssef htfm[] = { htfm0, htfm1, htfm2 }; + ssef vP = load4f(P); + ssef p0 = transform_point_T3(htfm, P_curve[0] - vP); + ssef p1 = transform_point_T3(htfm, P_curve[1] - vP); + ssef p2 = transform_point_T3(htfm, P_curve[2] - vP); + ssef p3 = transform_point_T3(htfm, P_curve[3] - vP); + + r_st = 
((float4 &)P_curve[1]).w; + r_en = ((float4 &)P_curve[2]).w; +#endif /* __KERNEL_AVX2__ */ + + float fc = 0.71f; + ssef vfc = ssef(fc); + ssef vfcxp3 = vfc * p3; + + vcurve_coef[0] = p1; + vcurve_coef[1] = vfc * (p2 - p0); + vcurve_coef[2] = madd(ssef(fc * 2.0f), p0, madd(ssef(fc - 3.0f), p1, msub(ssef(3.0f - 2.0f * fc), p2, vfcxp3))); + vcurve_coef[3] = msub(ssef(fc - 2.0f), p2 - p1, msub(vfc, p0, vfcxp3)); + + } +#else + float3 curve_coef[4]; + + /* curve Intersection check */ + /* obtain curve parameters */ + { + /* ray transform created - this should be created at beginning of intersection loop */ + Transform htfm; + float d = sqrtf(dir.x * dir.x + dir.z * dir.z); + htfm = make_transform( + dir.z / d, 0, -dir.x /d, 0, + -dir.x * dir.y /d, d, -dir.y * dir.z /d, 0, + dir.x, dir.y, dir.z, 0, + 0, 0, 0, 1); + + float4 v00 = kernel_tex_fetch(__curves, prim); + + int k0 = __float_as_int(v00.x) + segment; + int k1 = k0 + 1; + + int ka = max(k0 - 1,__float_as_int(v00.x)); + int kb = min(k1 + 1,__float_as_int(v00.x) + __float_as_int(v00.y) - 1); + + float4 P_curve[4]; + + if(is_curve_primitive) { + P_curve[0] = kernel_tex_fetch(__curve_keys, ka); + P_curve[1] = kernel_tex_fetch(__curve_keys, k0); + P_curve[2] = kernel_tex_fetch(__curve_keys, k1); + P_curve[3] = kernel_tex_fetch(__curve_keys, kb); + } + else { + int fobject = (object == OBJECT_NONE)? 
kernel_tex_fetch(__prim_object, curveAddr): object; + motion_cardinal_curve_keys(kg, fobject, prim, time, ka, k0, k1, kb, P_curve); + } + + float3 p0 = transform_point(&htfm, float4_to_float3(P_curve[0]) - P); + float3 p1 = transform_point(&htfm, float4_to_float3(P_curve[1]) - P); + float3 p2 = transform_point(&htfm, float4_to_float3(P_curve[2]) - P); + float3 p3 = transform_point(&htfm, float4_to_float3(P_curve[3]) - P); + + float fc = 0.71f; + curve_coef[0] = p1; + curve_coef[1] = -fc*p0 + fc*p2; + curve_coef[2] = 2.0f * fc * p0 + (fc - 3.0f) * p1 + (3.0f - 2.0f * fc) * p2 - fc * p3; + curve_coef[3] = -fc * p0 + (2.0f - fc) * p1 + (fc - 2.0f) * p2 + fc * p3; + r_st = P_curve[1].w; + r_en = P_curve[2].w; + } +#endif + + float r_curr = max(r_st, r_en); + + if((flags & CURVE_KN_RIBBONS) || !(flags & CURVE_KN_BACKFACING)) + epsilon = 2 * r_curr; + + /* find bounds - this is slow for cubic curves */ + float upper, lower; + + float zextrem[4]; + curvebounds(&lower, &upper, &zextrem[0], &zextrem[1], &zextrem[2], &zextrem[3], curve_coef[0].z, curve_coef[1].z, curve_coef[2].z, curve_coef[3].z); + if(lower - r_curr > isect->t || upper + r_curr < epsilon) + return false; + + /* minimum width extension */ + float mw_extension = min(difl * fabsf(upper), extmax); + float r_ext = mw_extension + r_curr; + + float xextrem[4]; + curvebounds(&lower, &upper, &xextrem[0], &xextrem[1], &xextrem[2], &xextrem[3], curve_coef[0].x, curve_coef[1].x, curve_coef[2].x, curve_coef[3].x); + if(lower > r_ext || upper < -r_ext) + return false; + + float yextrem[4]; + curvebounds(&lower, &upper, &yextrem[0], &yextrem[1], &yextrem[2], &yextrem[3], curve_coef[0].y, curve_coef[1].y, curve_coef[2].y, curve_coef[3].y); + if(lower > r_ext || upper < -r_ext) + return false; + + /* setup recurrent loop */ + int level = 1 << depth; + int tree = 0; + float resol = 1.0f / (float)level; + bool hit = false; + + /* begin loop */ + while(!(tree >> (depth))) { + const float i_st = tree * resol; + const float i_en 
= i_st + (level * resol); + +#ifdef __KERNEL_SSE2__ + ssef vi_st = ssef(i_st), vi_en = ssef(i_en); + ssef vp_st = madd(madd(madd(vcurve_coef[3], vi_st, vcurve_coef[2]), vi_st, vcurve_coef[1]), vi_st, vcurve_coef[0]); + ssef vp_en = madd(madd(madd(vcurve_coef[3], vi_en, vcurve_coef[2]), vi_en, vcurve_coef[1]), vi_en, vcurve_coef[0]); + + ssef vbmin = min(vp_st, vp_en); + ssef vbmax = max(vp_st, vp_en); + + float3 &bmin = (float3 &)vbmin, &bmax = (float3 &)vbmax; + float &bminx = bmin.x, &bminy = bmin.y, &bminz = bmin.z; + float &bmaxx = bmax.x, &bmaxy = bmax.y, &bmaxz = bmax.z; + float3 &p_st = (float3 &)vp_st, &p_en = (float3 &)vp_en; +#else + float3 p_st = ((curve_coef[3] * i_st + curve_coef[2]) * i_st + curve_coef[1]) * i_st + curve_coef[0]; + float3 p_en = ((curve_coef[3] * i_en + curve_coef[2]) * i_en + curve_coef[1]) * i_en + curve_coef[0]; + + float bminx = min(p_st.x, p_en.x); + float bmaxx = max(p_st.x, p_en.x); + float bminy = min(p_st.y, p_en.y); + float bmaxy = max(p_st.y, p_en.y); + float bminz = min(p_st.z, p_en.z); + float bmaxz = max(p_st.z, p_en.z); +#endif + + if(xextrem[0] >= i_st && xextrem[0] <= i_en) { + bminx = min(bminx,xextrem[1]); + bmaxx = max(bmaxx,xextrem[1]); + } + if(xextrem[2] >= i_st && xextrem[2] <= i_en) { + bminx = min(bminx,xextrem[3]); + bmaxx = max(bmaxx,xextrem[3]); + } + if(yextrem[0] >= i_st && yextrem[0] <= i_en) { + bminy = min(bminy,yextrem[1]); + bmaxy = max(bmaxy,yextrem[1]); + } + if(yextrem[2] >= i_st && yextrem[2] <= i_en) { + bminy = min(bminy,yextrem[3]); + bmaxy = max(bmaxy,yextrem[3]); + } + if(zextrem[0] >= i_st && zextrem[0] <= i_en) { + bminz = min(bminz,zextrem[1]); + bmaxz = max(bmaxz,zextrem[1]); + } + if(zextrem[2] >= i_st && zextrem[2] <= i_en) { + bminz = min(bminz,zextrem[3]); + bmaxz = max(bmaxz,zextrem[3]); + } + + float r1 = r_st + (r_en - r_st) * i_st; + float r2 = r_st + (r_en - r_st) * i_en; + r_curr = max(r1, r2); + + mw_extension = min(difl * fabsf(bmaxz), extmax); + float r_ext = mw_extension + 
r_curr; + float coverage = 1.0f; + + if(bminz - r_curr > isect->t || bmaxz + r_curr < epsilon || bminx > r_ext|| bmaxx < -r_ext|| bminy > r_ext|| bmaxy < -r_ext) { + /* the bounding box does not overlap the square centered at O */ + tree += level; + level = tree & -tree; + } + else if(level == 1) { + + /* the maximum recursion depth is reached. + * check if dP0.(Q-P0)>=0 and dPn.(Pn-Q)>=0. + * dP* is reversed if necessary.*/ + float t = isect->t; + float u = 0.0f; + float gd = 0.0f; + + if(flags & CURVE_KN_RIBBONS) { + float3 tg = (p_en - p_st); +#ifdef __KERNEL_SSE__ + const float3 tg_sq = tg * tg; + float w = tg_sq.x + tg_sq.y; +#else + float w = tg.x * tg.x + tg.y * tg.y; +#endif + if(w == 0) { + tree++; + level = tree & -tree; + continue; + } +#ifdef __KERNEL_SSE__ + const float3 p_sttg = p_st * tg; + w = -(p_sttg.x + p_sttg.y) / w; +#else + w = -(p_st.x * tg.x + p_st.y * tg.y) / w; +#endif + w = saturate(w); + + /* compute u on the curve segment */ + u = i_st * (1 - w) + i_en * w; + r_curr = r_st + (r_en - r_st) * u; + /* compare x-y distances */ + float3 p_curr = ((curve_coef[3] * u + curve_coef[2]) * u + curve_coef[1]) * u + curve_coef[0]; + + float3 dp_st = (3 * curve_coef[3] * i_st + 2 * curve_coef[2]) * i_st + curve_coef[1]; + if(dot(tg, dp_st)< 0) + dp_st *= -1; + if(dot(dp_st, -p_st) + p_curr.z * dp_st.z < 0) { + tree++; + level = tree & -tree; + continue; + } + float3 dp_en = (3 * curve_coef[3] * i_en + 2 * curve_coef[2]) * i_en + curve_coef[1]; + if(dot(tg, dp_en) < 0) + dp_en *= -1; + if(dot(dp_en, p_en) - p_curr.z * dp_en.z < 0) { + tree++; + level = tree & -tree; + continue; + } + + /* compute coverage */ + float r_ext = r_curr; + coverage = 1.0f; + if(difl != 0.0f) { + mw_extension = min(difl * fabsf(bmaxz), extmax); + r_ext = mw_extension + r_curr; +#ifdef __KERNEL_SSE__ + const float3 p_curr_sq = p_curr * p_curr; + const float3 dxxx(_mm_sqrt_ss(_mm_hadd_ps(p_curr_sq.m128, p_curr_sq.m128))); + float d = dxxx.x; +#else + float d = sqrtf(p_curr.x * 
p_curr.x + p_curr.y * p_curr.y); +#endif + float d0 = d - r_curr; + float d1 = d + r_curr; + float inv_mw_extension = 1.0f/mw_extension; + if(d0 >= 0) + coverage = (min(d1 * inv_mw_extension, 1.0f) - min(d0 * inv_mw_extension, 1.0f)) * 0.5f; + else // inside + coverage = (min(d1 * inv_mw_extension, 1.0f) + min(-d0 * inv_mw_extension, 1.0f)) * 0.5f; + } + + if(p_curr.x * p_curr.x + p_curr.y * p_curr.y >= r_ext * r_ext || p_curr.z <= epsilon || isect->t < p_curr.z) { + tree++; + level = tree & -tree; + continue; + } + + t = p_curr.z; + + /* stochastic fade from minimum width */ + if(difl != 0.0f && lcg_state) { + if(coverage != 1.0f && (lcg_step_float(lcg_state) > coverage)) + return hit; + } + } + else { + float l = len(p_en - p_st); + /* minimum width extension */ + float or1 = r1; + float or2 = r2; + + if(difl != 0.0f) { + mw_extension = min(len(p_st - P) * difl, extmax); + or1 = r1 < mw_extension ? mw_extension : r1; + mw_extension = min(len(p_en - P) * difl, extmax); + or2 = r2 < mw_extension ? 
mw_extension : r2; + } + /* --- */ + float invl = 1.0f/l; + float3 tg = (p_en - p_st) * invl; + gd = (or2 - or1) * invl; + float difz = -dot(p_st,tg); + float cyla = 1.0f - (tg.z * tg.z * (1 + gd*gd)); + float invcyla = 1.0f/cyla; + float halfb = (-p_st.z - tg.z*(difz + gd*(difz*gd + or1))); + float tcentre = -halfb*invcyla; + float zcentre = difz + (tg.z * tcentre); + float3 tdif = - p_st; + tdif.z += tcentre; + float tdifz = dot(tdif,tg); + float tb = 2*(tdif.z - tg.z*(tdifz + gd*(tdifz*gd + or1))); + float tc = dot(tdif,tdif) - tdifz * tdifz * (1 + gd*gd) - or1*or1 - 2*or1*tdifz*gd; + float td = tb*tb - 4*cyla*tc; + if(td < 0.0f) { + tree++; + level = tree & -tree; + continue; + } + + float rootd = sqrtf(td); + float correction = (-tb - rootd) * 0.5f * invcyla; + t = tcentre + correction; + + float3 dp_st = (3 * curve_coef[3] * i_st + 2 * curve_coef[2]) * i_st + curve_coef[1]; + if(dot(tg, dp_st)< 0) + dp_st *= -1; + float3 dp_en = (3 * curve_coef[3] * i_en + 2 * curve_coef[2]) * i_en + curve_coef[1]; + if(dot(tg, dp_en) < 0) + dp_en *= -1; + + if(flags & CURVE_KN_BACKFACING && (dot(dp_st, -p_st) + t * dp_st.z < 0 || dot(dp_en, p_en) - t * dp_en.z < 0 || isect->t < t || t <= 0.0f)) { + correction = (-tb + rootd) * 0.5f * invcyla; + t = tcentre + correction; + } + + if(dot(dp_st, -p_st) + t * dp_st.z < 0 || dot(dp_en, p_en) - t * dp_en.z < 0 || isect->t < t || t <= 0.0f) { + tree++; + level = tree & -tree; + continue; + } + + float w = (zcentre + (tg.z * correction)) * invl; + w = saturate(w); + /* compute u on the curve segment */ + u = i_st * (1 - w) + i_en * w; + + /* stochastic fade from minimum width */ + if(difl != 0.0f && lcg_state) { + r_curr = r1 + (r2 - r1) * w; + r_ext = or1 + (or2 - or1) * w; + coverage = r_curr/r_ext; + + if(coverage != 1.0f && (lcg_step_float(lcg_state) > coverage)) + return hit; + } + } + /* we found a new intersection */ + +#ifdef __VISIBILITY_FLAG__ + /* visibility flag test. 
we do it here under the assumption + * that most triangles are culled by node flags */ + if(kernel_tex_fetch(__prim_visibility, curveAddr) & visibility) +#endif + { + /* record intersection */ + isect->t = t; + isect->u = u; + isect->v = gd; + isect->prim = curveAddr; + isect->object = object; + isect->type = type; + hit = true; + } + + tree++; + level = tree & -tree; + } + else { + /* split the curve into two curves and process */ + level = level >> 1; + } + } + + return hit; +} + +ccl_device_curveintersect bool curve_intersect(KernelGlobals *kg, + Intersection *isect, + float3 P, + float3 direction, + uint visibility, + int object, + int curveAddr, + float time, + int type, + uint *lcg_state, + float difl, + float extmax) +{ + /* define few macros to minimize code duplication for SSE */ +#ifndef __KERNEL_SSE2__ +# define len3_squared(x) len_squared(x) +# define len3(x) len(x) +# define dot3(x, y) dot(x, y) +#endif + + const bool is_curve_primitive = (type & PRIMITIVE_CURVE); + + if(!is_curve_primitive && kernel_data.bvh.use_bvh_steps) { + const float2 prim_time = kernel_tex_fetch(__prim_time, curveAddr); + if(time < prim_time.x || time > prim_time.y) { + return false; + } + } + + int segment = PRIMITIVE_UNPACK_SEGMENT(type); + /* curve Intersection check */ + int flags = kernel_data.curve.curveflags; + + int prim = kernel_tex_fetch(__prim_index, curveAddr); + float4 v00 = kernel_tex_fetch(__curves, prim); + + int cnum = __float_as_int(v00.x); + int k0 = cnum + segment; + int k1 = k0 + 1; + +#ifndef __KERNEL_SSE2__ + float4 P_curve[2]; + + if(is_curve_primitive) { + P_curve[0] = kernel_tex_fetch(__curve_keys, k0); + P_curve[1] = kernel_tex_fetch(__curve_keys, k1); + } + else { + int fobject = (object == OBJECT_NONE)? 
kernel_tex_fetch(__prim_object, curveAddr): object; + motion_curve_keys(kg, fobject, prim, time, k0, k1, P_curve); + } + + float or1 = P_curve[0].w; + float or2 = P_curve[1].w; + float3 p1 = float4_to_float3(P_curve[0]); + float3 p2 = float4_to_float3(P_curve[1]); + + /* minimum width extension */ + float r1 = or1; + float r2 = or2; + float3 dif = P - p1; + float3 dif_second = P - p2; + if(difl != 0.0f) { + float pixelsize = min(len3(dif) * difl, extmax); + r1 = or1 < pixelsize ? pixelsize : or1; + pixelsize = min(len3(dif_second) * difl, extmax); + r2 = or2 < pixelsize ? pixelsize : or2; + } + /* --- */ + + float3 p21_diff = p2 - p1; + float3 sphere_dif1 = (dif + dif_second) * 0.5f; + float3 dir = direction; + float sphere_b_tmp = dot3(dir, sphere_dif1); + float3 sphere_dif2 = sphere_dif1 - sphere_b_tmp * dir; +#else + ssef P_curve[2]; + + if(is_curve_primitive) { + P_curve[0] = load4f(&kg->__curve_keys.data[k0].x); + P_curve[1] = load4f(&kg->__curve_keys.data[k1].x); + } + else { + int fobject = (object == OBJECT_NONE)? 
kernel_tex_fetch(__prim_object, curveAddr): object; + motion_curve_keys(kg, fobject, prim, time, k0, k1, (float4*)&P_curve); + } + + const ssef or12 = shuffle<3, 3, 3, 3>(P_curve[0], P_curve[1]); + + ssef r12 = or12; + const ssef vP = load4f(P); + const ssef dif = vP - P_curve[0]; + const ssef dif_second = vP - P_curve[1]; + if(difl != 0.0f) { + const ssef len1_sq = len3_squared_splat(dif); + const ssef len2_sq = len3_squared_splat(dif_second); + const ssef len12 = mm_sqrt(shuffle<0, 0, 0, 0>(len1_sq, len2_sq)); + const ssef pixelsize12 = min(len12 * difl, ssef(extmax)); + r12 = max(or12, pixelsize12); + } + float or1 = extract<0>(or12), or2 = extract<0>(shuffle<2>(or12)); + float r1 = extract<0>(r12), r2 = extract<0>(shuffle<2>(r12)); + + const ssef p21_diff = P_curve[1] - P_curve[0]; + const ssef sphere_dif1 = (dif + dif_second) * 0.5f; + const ssef dir = load4f(direction); + const ssef sphere_b_tmp = dot3_splat(dir, sphere_dif1); + const ssef sphere_dif2 = nmadd(sphere_b_tmp, dir, sphere_dif1); +#endif + + float mr = max(r1, r2); + float l = len3(p21_diff); + float invl = 1.0f / l; + float sp_r = mr + 0.5f * l; + + float sphere_b = dot3(dir, sphere_dif2); + float sdisc = sphere_b * sphere_b - len3_squared(sphere_dif2) + sp_r * sp_r; + + if(sdisc < 0.0f) + return false; + + /* obtain parameters and test midpoint distance for suitable modes */ +#ifndef __KERNEL_SSE2__ + float3 tg = p21_diff * invl; +#else + const ssef tg = p21_diff * invl; +#endif + float gd = (r2 - r1) * invl; + + float dirz = dot3(dir, tg); + float difz = dot3(dif, tg); + + float a = 1.0f - (dirz*dirz*(1 + gd*gd)); + + float halfb = dot3(dir, dif) - dirz*(difz + gd*(difz*gd + r1)); + + float tcentre = -halfb/a; + float zcentre = difz + (dirz * tcentre); + + if((tcentre > isect->t) && !(flags & CURVE_KN_ACCURATE)) + return false; + if((zcentre < 0 || zcentre > l) && !(flags & CURVE_KN_ACCURATE) && !(flags & CURVE_KN_INTERSECTCORRECTION)) + return false; + + /* test minimum separation */ +#ifndef 
__KERNEL_SSE2__ + float3 cprod = cross(tg, dir); + float cprod2sq = len3_squared(cross(tg, dif)); +#else + const ssef cprod = cross(tg, dir); + float cprod2sq = len3_squared(cross_zxy(tg, dif)); +#endif + float cprodsq = len3_squared(cprod); + float distscaled = dot3(cprod, dif); + + if(cprodsq == 0) + distscaled = cprod2sq; + else + distscaled = (distscaled*distscaled)/cprodsq; + + if(distscaled > mr*mr) + return false; + + /* calculate true intersection */ +#ifndef __KERNEL_SSE2__ + float3 tdif = dif + tcentre * dir; +#else + const ssef tdif = madd(ssef(tcentre), dir, dif); +#endif + float tdifz = dot3(tdif, tg); + float tdifma = tdifz*gd + r1; + float tb = 2*(dot3(dir, tdif) - dirz*(tdifz + gd*tdifma)); + float tc = dot3(tdif, tdif) - tdifz*tdifz - tdifma*tdifma; + float td = tb*tb - 4*a*tc; + + if(td < 0.0f) + return false; + + float rootd = 0.0f; + float correction = 0.0f; + if(flags & CURVE_KN_ACCURATE) { + rootd = sqrtf(td); + correction = ((-tb - rootd)/(2*a)); + } + + float t = tcentre + correction; + + if(t < isect->t) { + + if(flags & CURVE_KN_INTERSECTCORRECTION) { + rootd = sqrtf(td); + correction = ((-tb - rootd)/(2*a)); + t = tcentre + correction; + } + + float z = zcentre + (dirz * correction); + // bool backface = false; + + if(flags & CURVE_KN_BACKFACING && (t < 0.0f || z < 0 || z > l)) { + // backface = true; + correction = ((-tb + rootd)/(2*a)); + t = tcentre + correction; + z = zcentre + (dirz * correction); + } + + /* stochastic fade from minimum width */ + float adjradius = or1 + z * (or2 - or1) * invl; + adjradius = adjradius / (r1 + z * gd); + if(lcg_state && adjradius != 1.0f) { + if(lcg_step_float(lcg_state) > adjradius) + return false; + } + /* --- */ + + if(t > 0.0f && t < isect->t && z >= 0 && z <= l) { + + if(flags & CURVE_KN_ENCLOSEFILTER) { + float enc_ratio = 1.01f; + if((difz > -r1 * enc_ratio) && (dot3(dif_second, tg) < r2 * enc_ratio)) { + float a2 = 1.0f - (dirz*dirz*(1 + gd*gd*enc_ratio*enc_ratio)); + float c2 = dot3(dif, dif) 
- difz * difz * (1 + gd*gd*enc_ratio*enc_ratio) - r1*r1*enc_ratio*enc_ratio - 2*r1*difz*gd*enc_ratio; + if(a2*c2 < 0.0f) + return false; + } + } + +#ifdef __VISIBILITY_FLAG__ + /* visibility flag test. we do it here under the assumption + * that most triangles are culled by node flags */ + if(kernel_tex_fetch(__prim_visibility, curveAddr) & visibility) +#endif + { + /* record intersection */ + isect->t = t; + isect->u = z*invl; + isect->v = gd; + isect->prim = curveAddr; + isect->object = object; + isect->type = type; + + return true; + } + } + } + + return false; + +#ifndef __KERNEL_SSE2__ +# undef len3_squared +# undef len3 +# undef dot3 +#endif +} + +ccl_device_inline float3 curvetangent(float t, float3 p0, float3 p1, float3 p2, float3 p3) +{ + float fc = 0.71f; + float data[4]; + float t2 = t * t; + data[0] = -3.0f * fc * t2 + 4.0f * fc * t - fc; + data[1] = 3.0f * (2.0f - fc) * t2 + 2.0f * (fc - 3.0f) * t; + data[2] = 3.0f * (fc - 2.0f) * t2 + 2.0f * (3.0f - 2.0f * fc) * t + fc; + data[3] = 3.0f * fc * t2 - 2.0f * fc * t; + return data[0] * p0 + data[1] * p1 + data[2] * p2 + data[3] * p3; +} + +ccl_device_inline float3 curvepoint(float t, float3 p0, float3 p1, float3 p2, float3 p3) +{ + float data[4]; + float fc = 0.71f; + float t2 = t * t; + float t3 = t2 * t; + data[0] = -fc * t3 + 2.0f * fc * t2 - fc * t; + data[1] = (2.0f - fc) * t3 + (fc - 3.0f) * t2 + 1.0f; + data[2] = (fc - 2.0f) * t3 + (3.0f - 2.0f * fc) * t2 + fc * t; + data[3] = fc * t3 - fc * t2; + return data[0] * p0 + data[1] * p1 + data[2] * p2 + data[3] * p3; +} + +ccl_device_inline float3 curve_refine(KernelGlobals *kg, + ShaderData *sd, + const Intersection *isect, + const Ray *ray) +{ + int flag = kernel_data.curve.curveflags; + float t = isect->t; + float3 P = ray->P; + float3 D = ray->D; + + if(isect->object != OBJECT_NONE) { +#ifdef __OBJECT_MOTION__ + Transform tfm = sd->ob_itfm; +#else + Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_INVERSE_TRANSFORM); +#endif + + P = 
transform_point(&tfm, P); + D = transform_direction(&tfm, D*t); + D = normalize_len(D, &t); + } + + int prim = kernel_tex_fetch(__prim_index, isect->prim); + float4 v00 = kernel_tex_fetch(__curves, prim); + + int k0 = __float_as_int(v00.x) + PRIMITIVE_UNPACK_SEGMENT(sd->type); + int k1 = k0 + 1; + + float3 tg; + + if(flag & CURVE_KN_INTERPOLATE) { + int ka = max(k0 - 1,__float_as_int(v00.x)); + int kb = min(k1 + 1,__float_as_int(v00.x) + __float_as_int(v00.y) - 1); + + float4 P_curve[4]; + + if(sd->type & PRIMITIVE_CURVE) { + P_curve[0] = kernel_tex_fetch(__curve_keys, ka); + P_curve[1] = kernel_tex_fetch(__curve_keys, k0); + P_curve[2] = kernel_tex_fetch(__curve_keys, k1); + P_curve[3] = kernel_tex_fetch(__curve_keys, kb); + } + else { + motion_cardinal_curve_keys(kg, sd->object, sd->prim, sd->time, ka, k0, k1, kb, P_curve); + } + + float3 p[4]; + p[0] = float4_to_float3(P_curve[0]); + p[1] = float4_to_float3(P_curve[1]); + p[2] = float4_to_float3(P_curve[2]); + p[3] = float4_to_float3(P_curve[3]); + + P = P + D*t; + +#ifdef __UV__ + sd->u = isect->u; + sd->v = 0.0f; +#endif + + tg = normalize(curvetangent(isect->u, p[0], p[1], p[2], p[3])); + + if(kernel_data.curve.curveflags & CURVE_KN_RIBBONS) { + sd->Ng = normalize(-(D - tg * (dot(tg, D)))); + } + else { + /* direction from inside to surface of curve */ + float3 p_curr = curvepoint(isect->u, p[0], p[1], p[2], p[3]); + sd->Ng = normalize(P - p_curr); + + /* adjustment for changing radius */ + float gd = isect->v; + + if(gd != 0.0f) { + sd->Ng = sd->Ng - gd * tg; + sd->Ng = normalize(sd->Ng); + } + } + + /* todo: sometimes the normal is still so that this is detected as + * backfacing even if cull backfaces is enabled */ + + sd->N = sd->Ng; + } + else { + float4 P_curve[2]; + + if(sd->type & PRIMITIVE_CURVE) { + P_curve[0]= kernel_tex_fetch(__curve_keys, k0); + P_curve[1]= kernel_tex_fetch(__curve_keys, k1); + } + else { + motion_curve_keys(kg, sd->object, sd->prim, sd->time, k0, k1, P_curve); + } + + float l = 
1.0f; + tg = normalize_len(float4_to_float3(P_curve[1] - P_curve[0]), &l); + + P = P + D*t; + + float3 dif = P - float4_to_float3(P_curve[0]); + +#ifdef __UV__ + sd->u = dot(dif,tg)/l; + sd->v = 0.0f; +#endif + + if(flag & CURVE_KN_TRUETANGENTGNORMAL) { + sd->Ng = -(D - tg * dot(tg, D)); + sd->Ng = normalize(sd->Ng); + } + else { + float gd = isect->v; + + /* direction from inside to surface of curve */ + sd->Ng = (dif - tg * sd->u * l) / (P_curve[0].w + sd->u * l * gd); + + /* adjustment for changing radius */ + if(gd != 0.0f) { + sd->Ng = sd->Ng - gd * tg; + sd->Ng = normalize(sd->Ng); + } + } + + sd->N = sd->Ng; + } + +#ifdef __DPDU__ + /* dPdu/dPdv */ + sd->dPdu = tg; + sd->dPdv = cross(tg, sd->Ng); +#endif + + if(isect->object != OBJECT_NONE) { +#ifdef __OBJECT_MOTION__ + Transform tfm = sd->ob_tfm; +#else + Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_TRANSFORM); +#endif + + P = transform_point(&tfm, P); + } + + return P; +} + +#endif + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/geom/geom_motion_curve.h b/intern/cycles/kernel/geom/geom_motion_curve.h index 6de5aa7ea99..119bdb2f15c 100644 --- a/intern/cycles/kernel/geom/geom_motion_curve.h +++ b/intern/cycles/kernel/geom/geom_motion_curve.h @@ -50,12 +50,12 @@ ccl_device_inline int find_attribute_curve_motion(KernelGlobals *kg, int object, ccl_device_inline void motion_curve_keys_for_step(KernelGlobals *kg, int offset, int numkeys, int numsteps, int step, int k0, int k1, float4 keys[2]) { if(step == numsteps) { - /* center step: regular vertex location */ + /* center step: regular key location */ keys[0] = kernel_tex_fetch(__curve_keys, k0); keys[1] = kernel_tex_fetch(__curve_keys, k1); } else { - /* center step not stored in this array */ + /* center step is not stored in this array */ if(step > numsteps) step--; @@ -97,14 +97,14 @@ ccl_device_inline void motion_curve_keys(KernelGlobals *kg, int object, int prim ccl_device_inline void 
motion_cardinal_curve_keys_for_step(KernelGlobals *kg, int offset, int numkeys, int numsteps, int step, int k0, int k1, int k2, int k3, float4 keys[4]) { if(step == numsteps) { - /* center step: regular vertex location */ + /* center step: regular key location */ keys[0] = kernel_tex_fetch(__curve_keys, k0); keys[1] = kernel_tex_fetch(__curve_keys, k1); keys[2] = kernel_tex_fetch(__curve_keys, k2); keys[3] = kernel_tex_fetch(__curve_keys, k3); } else { - /* center step not store in this array */ + /* center step is not stored in this array */ if(step > numsteps) step--; @@ -118,7 +118,12 @@ ccl_device_inline void motion_cardinal_curve_keys_for_step(KernelGlobals *kg, in } /* return 2 curve key locations */ -ccl_device_inline void motion_cardinal_curve_keys(KernelGlobals *kg, int object, int prim, float time, int k0, int k1, int k2, int k3, float4 keys[4]) +ccl_device_inline void motion_cardinal_curve_keys(KernelGlobals *kg, + int object, + int prim, + float time, + int k0, int k1, int k2, int k3, + float4 keys[4]) { /* get motion info */ int numsteps, numkeys; @@ -147,6 +152,65 @@ ccl_device_inline void motion_cardinal_curve_keys(KernelGlobals *kg, int object, keys[3] = (1.0f - t)*keys[3] + t*next_keys[3]; } +#if defined(__KERNEL_AVX2__) && defined(__KERNEL_SSE__) +/* Similar to above, but returns keys as pair of two AVX registers with each + * holding two float4. + */ +ccl_device_inline void motion_cardinal_curve_keys_avx(KernelGlobals *kg, + int object, + int prim, + float time, + int k0, int k1, + int k2, int k3, + avxf *out_keys_0_1, + avxf *out_keys_2_3) +{ + /* Get motion info. */ + int numsteps, numkeys; + object_motion_info(kg, object, &numsteps, NULL, &numkeys); + + /* Figure out which steps we need to fetch and their interpolation factor. */ + int maxstep = numsteps * 2; + int step = min((int)(time*maxstep), maxstep - 1); + float t = time*maxstep - step; + + /* Find attribute. 
*/ + AttributeElement elem; + int offset = find_attribute_curve_motion(kg, + object, + ATTR_STD_MOTION_VERTEX_POSITION, + &elem); + kernel_assert(offset != ATTR_STD_NOT_FOUND); + + /* Fetch key coordinates. */ + float4 next_keys[4]; + float4 keys[4]; + motion_cardinal_curve_keys_for_step(kg, + offset, + numkeys, + numsteps, + step, + k0, k1, k2, k3, + keys); + motion_cardinal_curve_keys_for_step(kg, + offset, + numkeys, + numsteps, + step + 1, + k0, k1, k2, k3, + next_keys); + + const avxf keys_0_1 = avxf(keys[0].m128, keys[1].m128); + const avxf keys_2_3 = avxf(keys[2].m128, keys[3].m128); + const avxf next_keys_0_1 = avxf(next_keys[0].m128, next_keys[1].m128); + const avxf next_keys_2_3 = avxf(next_keys[2].m128, next_keys[3].m128); + + /* Interpolate between steps. */ + *out_keys_0_1 = (1.0f - t) * keys_0_1 + t*next_keys_0_1; + *out_keys_2_3 = (1.0f - t) * keys_2_3 + t*next_keys_2_3; +} +#endif + #endif CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/geom/geom_motion_triangle.h b/intern/cycles/kernel/geom/geom_motion_triangle.h index 3cbe59aaece..4e84aa97776 100644 --- a/intern/cycles/kernel/geom/geom_motion_triangle.h +++ b/intern/cycles/kernel/geom/geom_motion_triangle.h @@ -76,7 +76,7 @@ ccl_device_inline void motion_triangle_normals_for_step(KernelGlobals *kg, uint4 normals[2] = float4_to_float3(kernel_tex_fetch(__tri_vnormal, tri_vindex.z)); } else { - /* center step not stored in this array */ + /* center step is not stored in this array */ if(step > numsteps) step--; @@ -117,312 +117,4 @@ ccl_device_inline void motion_triangle_vertices(KernelGlobals *kg, int object, i verts[2] = (1.0f - t)*verts[2] + t*next_verts[2]; } -/* Refine triangle intersection to more precise hit point. For rays that travel - * far the precision is often not so good, this reintersects the primitive from - * a closer distance. 
*/ - -ccl_device_inline float3 motion_triangle_refine(KernelGlobals *kg, ShaderData *sd, const Intersection *isect, const Ray *ray, float3 verts[3]) -{ - float3 P = ray->P; - float3 D = ray->D; - float t = isect->t; - -#ifdef __INTERSECTION_REFINE__ - if(isect->object != OBJECT_NONE) { - if(UNLIKELY(t == 0.0f)) { - return P; - } -# ifdef __OBJECT_MOTION__ - Transform tfm = ccl_fetch(sd, ob_itfm); -# else - Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_INVERSE_TRANSFORM); -# endif - - P = transform_point(&tfm, P); - D = transform_direction(&tfm, D*t); - D = normalize_len(D, &t); - } - - P = P + D*t; - - /* compute refined intersection distance */ - const float3 e1 = verts[0] - verts[2]; - const float3 e2 = verts[1] - verts[2]; - const float3 s1 = cross(D, e2); - - const float invdivisor = 1.0f/dot(s1, e1); - const float3 d = P - verts[2]; - const float3 s2 = cross(d, e1); - float rt = dot(e2, s2)*invdivisor; - - /* compute refined position */ - P = P + D*rt; - - if(isect->object != OBJECT_NONE) { -# ifdef __OBJECT_MOTION__ - Transform tfm = ccl_fetch(sd, ob_tfm); -# else - Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_TRANSFORM); -# endif - - P = transform_point(&tfm, P); - } - - return P; -#else - return P + D*t; -#endif -} - -/* Same as above, except that isect->t is assumed to be in object space for instancing */ - -#ifdef __SUBSURFACE__ -# if defined(__KERNEL_CUDA__) && (defined(i386) || defined(_M_IX86)) -ccl_device_noinline -# else -ccl_device_inline -# endif -float3 motion_triangle_refine_subsurface(KernelGlobals *kg, ShaderData *sd, const Intersection *isect, const Ray *ray, float3 verts[3]) -{ - float3 P = ray->P; - float3 D = ray->D; - float t = isect->t; - -# ifdef __INTERSECTION_REFINE__ - if(isect->object != OBJECT_NONE) { -# ifdef __OBJECT_MOTION__ - Transform tfm = ccl_fetch(sd, ob_itfm); -# else - Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_INVERSE_TRANSFORM); -# endif - - P = 
transform_point(&tfm, P); - D = transform_direction(&tfm, D); - D = normalize(D); - } - - P = P + D*t; - - /* compute refined intersection distance */ - const float3 e1 = verts[0] - verts[2]; - const float3 e2 = verts[1] - verts[2]; - const float3 s1 = cross(D, e2); - - const float invdivisor = 1.0f/dot(s1, e1); - const float3 d = P - verts[2]; - const float3 s2 = cross(d, e1); - float rt = dot(e2, s2)*invdivisor; - - P = P + D*rt; - - if(isect->object != OBJECT_NONE) { -# ifdef __OBJECT_MOTION__ - Transform tfm = ccl_fetch(sd, ob_tfm); -# else - Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_TRANSFORM); -# endif - - P = transform_point(&tfm, P); - } - - return P; -# else - return P + D*t; -# endif -} -#endif - -/* Setup of motion triangle specific parts of ShaderData, moved into this one - * function to more easily share computation of interpolated positions and - * normals */ - -/* return 3 triangle vertex normals */ -ccl_device_noinline void motion_triangle_shader_setup(KernelGlobals *kg, ShaderData *sd, const Intersection *isect, const Ray *ray, bool subsurface) -{ - /* get shader */ - ccl_fetch(sd, shader) = kernel_tex_fetch(__tri_shader, ccl_fetch(sd, prim)); - - /* get motion info */ - int numsteps, numverts; - object_motion_info(kg, ccl_fetch(sd, object), &numsteps, &numverts, NULL); - - /* figure out which steps we need to fetch and their interpolation factor */ - int maxstep = numsteps*2; - int step = min((int)(ccl_fetch(sd, time)*maxstep), maxstep-1); - float t = ccl_fetch(sd, time)*maxstep - step; - - /* find attribute */ - AttributeElement elem; - int offset = find_attribute_motion(kg, ccl_fetch(sd, object), ATTR_STD_MOTION_VERTEX_POSITION, &elem); - kernel_assert(offset != ATTR_STD_NOT_FOUND); - - /* fetch vertex coordinates */ - float3 verts[3], next_verts[3]; - uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, ccl_fetch(sd, prim)); - - motion_triangle_verts_for_step(kg, tri_vindex, offset, numverts, numsteps, step, verts); - 
motion_triangle_verts_for_step(kg, tri_vindex, offset, numverts, numsteps, step+1, next_verts); - - /* interpolate between steps */ - verts[0] = (1.0f - t)*verts[0] + t*next_verts[0]; - verts[1] = (1.0f - t)*verts[1] + t*next_verts[1]; - verts[2] = (1.0f - t)*verts[2] + t*next_verts[2]; - - /* compute refined position */ -#ifdef __SUBSURFACE__ - if(!subsurface) -#endif - ccl_fetch(sd, P) = motion_triangle_refine(kg, sd, isect, ray, verts); -#ifdef __SUBSURFACE__ - else - ccl_fetch(sd, P) = motion_triangle_refine_subsurface(kg, sd, isect, ray, verts); -#endif - - /* compute face normal */ - float3 Ng; - if(ccl_fetch(sd, flag) & SD_NEGATIVE_SCALE_APPLIED) - Ng = normalize(cross(verts[2] - verts[0], verts[1] - verts[0])); - else - Ng = normalize(cross(verts[1] - verts[0], verts[2] - verts[0])); - - ccl_fetch(sd, Ng) = Ng; - ccl_fetch(sd, N) = Ng; - - /* compute derivatives of P w.r.t. uv */ -#ifdef __DPDU__ - ccl_fetch(sd, dPdu) = (verts[0] - verts[2]); - ccl_fetch(sd, dPdv) = (verts[1] - verts[2]); -#endif - - /* compute smooth normal */ - if(ccl_fetch(sd, shader) & SHADER_SMOOTH_NORMAL) { - /* find attribute */ - AttributeElement elem; - int offset = find_attribute_motion(kg, ccl_fetch(sd, object), ATTR_STD_MOTION_VERTEX_NORMAL, &elem); - kernel_assert(offset != ATTR_STD_NOT_FOUND); - - /* fetch vertex coordinates */ - float3 normals[3], next_normals[3]; - motion_triangle_normals_for_step(kg, tri_vindex, offset, numverts, numsteps, step, normals); - motion_triangle_normals_for_step(kg, tri_vindex, offset, numverts, numsteps, step+1, next_normals); - - /* interpolate between steps */ - normals[0] = (1.0f - t)*normals[0] + t*next_normals[0]; - normals[1] = (1.0f - t)*normals[1] + t*next_normals[1]; - normals[2] = (1.0f - t)*normals[2] + t*next_normals[2]; - - /* interpolate between vertices */ - float u = ccl_fetch(sd, u); - float v = ccl_fetch(sd, v); - float w = 1.0f - u - v; - ccl_fetch(sd, N) = (u*normals[0] + v*normals[1] + w*normals[2]); - } -} - -/* Ray 
intersection. We simply compute the vertex positions at the given ray - * time and do a ray intersection with the resulting triangle */ - -ccl_device_inline bool motion_triangle_intersect(KernelGlobals *kg, Intersection *isect, - float3 P, float3 dir, float time, uint visibility, int object, int triAddr) -{ - /* primitive index for vertex location lookup */ - int prim = kernel_tex_fetch(__prim_index, triAddr); - int fobject = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, triAddr): object; - - /* get vertex locations for intersection */ - float3 verts[3]; - motion_triangle_vertices(kg, fobject, prim, time, verts); - - /* ray-triangle intersection, unoptimized */ - float t, u, v; - - if(ray_triangle_intersect_uv(P, dir, isect->t, verts[2], verts[0], verts[1], &u, &v, &t)) { -#ifdef __VISIBILITY_FLAG__ - /* visibility flag test. we do it here under the assumption - * that most triangles are culled by node flags */ - if(kernel_tex_fetch(__prim_visibility, triAddr) & visibility) -#endif - { - isect->t = t; - isect->u = u; - isect->v = v; - isect->prim = triAddr; - isect->object = object; - isect->type = PRIMITIVE_MOTION_TRIANGLE; - - return true; - } - } - - return false; -} - -/* Special ray intersection routines for subsurface scattering. In that case we - * only want to intersect with primitives in the same object, and if case of - * multiple hits we pick a single random primitive as the intersection point. */ - -#ifdef __SUBSURFACE__ -ccl_device_inline void motion_triangle_intersect_subsurface( - KernelGlobals *kg, - SubsurfaceIntersection *ss_isect, - float3 P, - float3 dir, - float time, - int object, - int triAddr, - float tmax, - uint *lcg_state, - int max_hits) -{ - /* primitive index for vertex location lookup */ - int prim = kernel_tex_fetch(__prim_index, triAddr); - int fobject = (object == OBJECT_NONE)? 
kernel_tex_fetch(__prim_object, triAddr): object; - - /* get vertex locations for intersection */ - float3 verts[3]; - motion_triangle_vertices(kg, fobject, prim, time, verts); - - /* ray-triangle intersection, unoptimized */ - float t, u, v; - - if(ray_triangle_intersect_uv(P, dir, tmax, verts[2], verts[0], verts[1], &u, &v, &t)) { - for(int i = min(max_hits, ss_isect->num_hits) - 1; i >= 0; --i) { - if(ss_isect->hits[i].t == t) { - return; - } - } - - ss_isect->num_hits++; - - int hit; - - if(ss_isect->num_hits <= max_hits) { - hit = ss_isect->num_hits - 1; - } - else { - /* reservoir sampling: if we are at the maximum number of - * hits, randomly replace element or skip it */ - hit = lcg_step_uint(lcg_state) % ss_isect->num_hits; - - if(hit >= max_hits) - return; - } - - /* record intersection */ - Intersection *isect = &ss_isect->hits[hit]; - isect->t = t; - isect->u = u; - isect->v = v; - isect->prim = triAddr; - isect->object = object; - isect->type = PRIMITIVE_MOTION_TRIANGLE; - - /* Record geometric normal. */ - ss_isect->Ng[hit] = normalize(cross(verts[1] - verts[0], - verts[2] - verts[0])); - } -} -#endif - CCL_NAMESPACE_END - diff --git a/intern/cycles/kernel/geom/geom_motion_triangle_intersect.h b/intern/cycles/kernel/geom/geom_motion_triangle_intersect.h new file mode 100644 index 00000000000..f74995becf5 --- /dev/null +++ b/intern/cycles/kernel/geom/geom_motion_triangle_intersect.h @@ -0,0 +1,289 @@ +/* + * Copyright 2011-2016 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* Motion Triangle Primitive + * + * These are stored as regular triangles, plus extra positions and normals at + * times other than the frame center. Computing the triangle vertex positions + * or normals at a given ray time is a matter of interpolation of the two steps + * between which the ray time lies. + * + * The extra positions and normals are stored as ATTR_STD_MOTION_VERTEX_POSITION + * and ATTR_STD_MOTION_VERTEX_NORMAL mesh attributes. + */ + +CCL_NAMESPACE_BEGIN + +/* Refine triangle intersection to more precise hit point. For rays that travel + * far the precision is often not so good, this reintersects the primitive from + * a closer distance. + */ + +ccl_device_inline float3 motion_triangle_refine(KernelGlobals *kg, + ShaderData *sd, + const Intersection *isect, + const Ray *ray, + float3 verts[3]) +{ + float3 P = ray->P; + float3 D = ray->D; + float t = isect->t; + +#ifdef __INTERSECTION_REFINE__ + if(isect->object != OBJECT_NONE) { + if(UNLIKELY(t == 0.0f)) { + return P; + } +# ifdef __OBJECT_MOTION__ + Transform tfm = sd->ob_itfm; +# else + Transform tfm = object_fetch_transform(kg, + isect->object, + OBJECT_INVERSE_TRANSFORM); +# endif + + P = transform_point(&tfm, P); + D = transform_direction(&tfm, D*t); + D = normalize_len(D, &t); + } + + P = P + D*t; + + /* Compute refined intersection distance. */ + const float3 e1 = verts[0] - verts[2]; + const float3 e2 = verts[1] - verts[2]; + const float3 s1 = cross(D, e2); + + const float invdivisor = 1.0f/dot(s1, e1); + const float3 d = P - verts[2]; + const float3 s2 = cross(d, e1); + float rt = dot(e2, s2)*invdivisor; + + /* Compute refined position. 
*/ + P = P + D*rt; + + if(isect->object != OBJECT_NONE) { +# ifdef __OBJECT_MOTION__ + Transform tfm = sd->ob_tfm; +# else + Transform tfm = object_fetch_transform(kg, + isect->object, + OBJECT_TRANSFORM); +# endif + + P = transform_point(&tfm, P); + } + + return P; +#else + return P + D*t; +#endif +} + +/* Same as above, except that isect->t is assumed to be in object space + * for instancing. + */ + +#ifdef __SUBSURFACE__ +# if defined(__KERNEL_CUDA__) && (defined(i386) || defined(_M_IX86)) +ccl_device_noinline +# else +ccl_device_inline +# endif +float3 motion_triangle_refine_subsurface(KernelGlobals *kg, + ShaderData *sd, + const Intersection *isect, + const Ray *ray, + float3 verts[3]) +{ + float3 P = ray->P; + float3 D = ray->D; + float t = isect->t; + +# ifdef __INTERSECTION_REFINE__ + if(isect->object != OBJECT_NONE) { +# ifdef __OBJECT_MOTION__ + Transform tfm = sd->ob_itfm; +# else + Transform tfm = object_fetch_transform(kg, + isect->object, + OBJECT_INVERSE_TRANSFORM); +# endif + + P = transform_point(&tfm, P); + D = transform_direction(&tfm, D); + D = normalize(D); + } + + P = P + D*t; + + /* compute refined intersection distance */ + const float3 e1 = verts[0] - verts[2]; + const float3 e2 = verts[1] - verts[2]; + const float3 s1 = cross(D, e2); + + const float invdivisor = 1.0f/dot(s1, e1); + const float3 d = P - verts[2]; + const float3 s2 = cross(d, e1); + float rt = dot(e2, s2)*invdivisor; + + P = P + D*rt; + + if(isect->object != OBJECT_NONE) { +# ifdef __OBJECT_MOTION__ + Transform tfm = sd->ob_tfm; +# else + Transform tfm = object_fetch_transform(kg, + isect->object, + OBJECT_TRANSFORM); +# endif + + P = transform_point(&tfm, P); + } + + return P; +# else /* __INTERSECTION_REFINE__ */ + return P + D*t; +# endif /* __INTERSECTION_REFINE__ */ +} +#endif /* __SUBSURFACE__ */ + + +/* Ray intersection. We simply compute the vertex positions at the given ray + * time and do a ray intersection with the resulting triangle. 
+ */ + +ccl_device_inline bool motion_triangle_intersect( + KernelGlobals *kg, + Intersection *isect, + float3 P, + float3 dir, + float time, + uint visibility, + int object, + int prim_addr) +{ + /* Primitive index for vertex location lookup. */ + int prim = kernel_tex_fetch(__prim_index, prim_addr); + int fobject = (object == OBJECT_NONE) + ? kernel_tex_fetch(__prim_object, prim_addr) + : object; + /* Get vertex locations for intersection. */ + float3 verts[3]; + motion_triangle_vertices(kg, fobject, prim, time, verts); + /* Ray-triangle intersection, unoptimized. */ + float t, u, v; + if(ray_triangle_intersect(P, + dir, + isect->t, +#if defined(__KERNEL_SSE2__) && defined(__KERNEL_SSE__) + (ssef*)verts, +#else + verts[0], verts[1], verts[2], +#endif + &u, &v, &t)) + { +#ifdef __VISIBILITY_FLAG__ + /* Visibility flag test. we do it here under the assumption + * that most triangles are culled by node flags. + */ + if(kernel_tex_fetch(__prim_visibility, prim_addr) & visibility) +#endif + { + isect->t = t; + isect->u = u; + isect->v = v; + isect->prim = prim_addr; + isect->object = object; + isect->type = PRIMITIVE_MOTION_TRIANGLE; + return true; + } + } + return false; +} + +/* Special ray intersection routines for subsurface scattering. In that case we + * only want to intersect with primitives in the same object, and if case of + * multiple hits we pick a single random primitive as the intersection point. + */ +#ifdef __SUBSURFACE__ +ccl_device_inline void motion_triangle_intersect_subsurface( + KernelGlobals *kg, + SubsurfaceIntersection *ss_isect, + float3 P, + float3 dir, + float time, + int object, + int prim_addr, + float tmax, + uint *lcg_state, + int max_hits) +{ + /* Primitive index for vertex location lookup. */ + int prim = kernel_tex_fetch(__prim_index, prim_addr); + int fobject = (object == OBJECT_NONE) + ? kernel_tex_fetch(__prim_object, prim_addr) + : object; + /* Get vertex locations for intersection. 
*/ + float3 verts[3]; + motion_triangle_vertices(kg, fobject, prim, time, verts); + /* Ray-triangle intersection, unoptimized. */ + float t, u, v; + if(ray_triangle_intersect(P, + dir, + tmax, +#if defined(__KERNEL_SSE2__) && defined(__KERNEL_SSE__) + (ssef*)verts, +#else + verts[0], verts[1], verts[2], +#endif + &u, &v, &t)) + { + for(int i = min(max_hits, ss_isect->num_hits) - 1; i >= 0; --i) { + if(ss_isect->hits[i].t == t) { + return; + } + } + ss_isect->num_hits++; + int hit; + if(ss_isect->num_hits <= max_hits) { + hit = ss_isect->num_hits - 1; + } + else { + /* Reservoir sampling: if we are at the maximum number of + * hits, randomly replace element or skip it. + */ + hit = lcg_step_uint(lcg_state) % ss_isect->num_hits; + + if(hit >= max_hits) + return; + } + /* Record intersection. */ + Intersection *isect = &ss_isect->hits[hit]; + isect->t = t; + isect->u = u; + isect->v = v; + isect->prim = prim_addr; + isect->object = object; + isect->type = PRIMITIVE_MOTION_TRIANGLE; + /* Record geometric normal. */ + ss_isect->Ng[hit] = normalize(cross(verts[1] - verts[0], + verts[2] - verts[0])); + } +} +#endif /* __SUBSURFACE__ */ + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/geom/geom_motion_triangle_shader.h b/intern/cycles/kernel/geom/geom_motion_triangle_shader.h new file mode 100644 index 00000000000..cb456056e20 --- /dev/null +++ b/intern/cycles/kernel/geom/geom_motion_triangle_shader.h @@ -0,0 +1,123 @@ +/* + * Copyright 2011-2016 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* Motion Triangle Primitive + * + * These are stored as regular triangles, plus extra positions and normals at + * times other than the frame center. Computing the triangle vertex positions + * or normals at a given ray time is a matter of interpolation of the two steps + * between which the ray time lies. + * + * The extra positions and normals are stored as ATTR_STD_MOTION_VERTEX_POSITION + * and ATTR_STD_MOTION_VERTEX_NORMAL mesh attributes. + */ + +CCL_NAMESPACE_BEGIN + +/* Setup of motion triangle specific parts of ShaderData, moved into this one + * function to more easily share computation of interpolated positions and + * normals */ + +/* return 3 triangle vertex normals */ +ccl_device_noinline void motion_triangle_shader_setup(KernelGlobals *kg, + ShaderData *sd, const + Intersection *isect, + const Ray *ray, + bool subsurface) +{ + /* Get shader. */ + sd->shader = kernel_tex_fetch(__tri_shader, sd->prim); + /* Get motion info. */ + /* TODO(sergey): This logic is really similar to motion_triangle_vertices(), + * can we de-duplicate something here? + */ + int numsteps, numverts; + object_motion_info(kg, sd->object, &numsteps, &numverts, NULL); + /* Figure out which steps we need to fetch and their interpolation factor. */ + int maxstep = numsteps*2; + int step = min((int)(sd->time*maxstep), maxstep-1); + float t = sd->time*maxstep - step; + /* Find attribute. */ + AttributeElement elem; + int offset = find_attribute_motion(kg, sd->object, + ATTR_STD_MOTION_VERTEX_POSITION, + &elem); + kernel_assert(offset != ATTR_STD_NOT_FOUND); + /* Fetch vertex coordinates. 
*/ + float3 verts[3], next_verts[3]; + uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, sd->prim); + motion_triangle_verts_for_step(kg, tri_vindex, offset, numverts, numsteps, step, verts); + motion_triangle_verts_for_step(kg, tri_vindex, offset, numverts, numsteps, step+1, next_verts); + /* Interpolate between steps. */ + verts[0] = (1.0f - t)*verts[0] + t*next_verts[0]; + verts[1] = (1.0f - t)*verts[1] + t*next_verts[1]; + verts[2] = (1.0f - t)*verts[2] + t*next_verts[2]; + /* Compute refined position. */ +#ifdef __SUBSURFACE__ + if(subsurface) { + sd->P = motion_triangle_refine_subsurface(kg, + sd, + isect, + ray, + verts); + } + else +#endif /* __SUBSURFACE__*/ + { + sd->P = motion_triangle_refine(kg, sd, isect, ray, verts); + } + /* Compute face normal. */ + float3 Ng; + if(sd->object_flag & SD_OBJECT_NEGATIVE_SCALE_APPLIED) { + Ng = normalize(cross(verts[2] - verts[0], verts[1] - verts[0])); + } + else { + Ng = normalize(cross(verts[1] - verts[0], verts[2] - verts[0])); + } + sd->Ng = Ng; + sd->N = Ng; + /* Compute derivatives of P w.r.t. uv. */ +#ifdef __DPDU__ + sd->dPdu = (verts[0] - verts[2]); + sd->dPdv = (verts[1] - verts[2]); +#endif + /* Compute smooth normal. */ + if(sd->shader & SHADER_SMOOTH_NORMAL) { + /* Find attribute. */ + AttributeElement elem; + int offset = find_attribute_motion(kg, + sd->object, + ATTR_STD_MOTION_VERTEX_NORMAL, + &elem); + kernel_assert(offset != ATTR_STD_NOT_FOUND); + /* Fetch vertex coordinates. */ + float3 normals[3], next_normals[3]; + motion_triangle_normals_for_step(kg, tri_vindex, offset, numverts, numsteps, step, normals); + motion_triangle_normals_for_step(kg, tri_vindex, offset, numverts, numsteps, step+1, next_normals); + /* Interpolate between steps. */ + normals[0] = (1.0f - t)*normals[0] + t*next_normals[0]; + normals[1] = (1.0f - t)*normals[1] + t*next_normals[1]; + normals[2] = (1.0f - t)*normals[2] + t*next_normals[2]; + /* Interpolate between vertices. 
*/ + float u = sd->u; + float v = sd->v; + float w = 1.0f - u - v; + sd->N = (u*normals[0] + v*normals[1] + w*normals[2]); + } +} + +CCL_NAMESPACE_END + diff --git a/intern/cycles/kernel/geom/geom_object.h b/intern/cycles/kernel/geom/geom_object.h index 9f0fe032ba4..1ffc143be34 100644 --- a/intern/cycles/kernel/geom/geom_object.h +++ b/intern/cycles/kernel/geom/geom_object.h @@ -113,7 +113,6 @@ ccl_device_inline Transform object_fetch_transform_motion(KernelGlobals *kg, int ccl_device_inline Transform object_fetch_transform_motion_test(KernelGlobals *kg, int object, float time, Transform *itfm) { int object_flag = kernel_tex_fetch(__object_flag, object); - if(object_flag & SD_OBJECT_MOTION) { /* if we do motion blur */ Transform tfm = object_fetch_transform_motion(kg, object, time); @@ -138,9 +137,9 @@ ccl_device_inline Transform object_fetch_transform_motion_test(KernelGlobals *kg ccl_device_inline void object_position_transform(KernelGlobals *kg, const ShaderData *sd, float3 *P) { #ifdef __OBJECT_MOTION__ - *P = transform_point_auto(&ccl_fetch(sd, ob_tfm), *P); + *P = transform_point_auto(&sd->ob_tfm, *P); #else - Transform tfm = object_fetch_transform(kg, ccl_fetch(sd, object), OBJECT_TRANSFORM); + Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_TRANSFORM); *P = transform_point(&tfm, *P); #endif } @@ -150,9 +149,9 @@ ccl_device_inline void object_position_transform(KernelGlobals *kg, const Shader ccl_device_inline void object_inverse_position_transform(KernelGlobals *kg, const ShaderData *sd, float3 *P) { #ifdef __OBJECT_MOTION__ - *P = transform_point_auto(&ccl_fetch(sd, ob_itfm), *P); + *P = transform_point_auto(&sd->ob_itfm, *P); #else - Transform tfm = object_fetch_transform(kg, ccl_fetch(sd, object), OBJECT_INVERSE_TRANSFORM); + Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_INVERSE_TRANSFORM); *P = transform_point(&tfm, *P); #endif } @@ -162,12 +161,12 @@ ccl_device_inline void object_inverse_position_transform(KernelGlobals 
*kg, cons ccl_device_inline void object_inverse_normal_transform(KernelGlobals *kg, const ShaderData *sd, float3 *N) { #ifdef __OBJECT_MOTION__ - if((ccl_fetch(sd, object) != OBJECT_NONE) || (ccl_fetch(sd, type) == PRIMITIVE_LAMP)) { - *N = normalize(transform_direction_transposed_auto(&ccl_fetch(sd, ob_tfm), *N)); + if((sd->object != OBJECT_NONE) || (sd->type == PRIMITIVE_LAMP)) { + *N = normalize(transform_direction_transposed_auto(&sd->ob_tfm, *N)); } #else - if(ccl_fetch(sd, object) != OBJECT_NONE) { - Transform tfm = object_fetch_transform(kg, ccl_fetch(sd, object), OBJECT_TRANSFORM); + if(sd->object != OBJECT_NONE) { + Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_TRANSFORM); *N = normalize(transform_direction_transposed(&tfm, *N)); } #endif @@ -178,9 +177,9 @@ ccl_device_inline void object_inverse_normal_transform(KernelGlobals *kg, const ccl_device_inline void object_normal_transform(KernelGlobals *kg, const ShaderData *sd, float3 *N) { #ifdef __OBJECT_MOTION__ - *N = normalize(transform_direction_transposed_auto(&ccl_fetch(sd, ob_itfm), *N)); + *N = normalize(transform_direction_transposed_auto(&sd->ob_itfm, *N)); #else - Transform tfm = object_fetch_transform(kg, ccl_fetch(sd, object), OBJECT_INVERSE_TRANSFORM); + Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_INVERSE_TRANSFORM); *N = normalize(transform_direction_transposed(&tfm, *N)); #endif } @@ -190,9 +189,9 @@ ccl_device_inline void object_normal_transform(KernelGlobals *kg, const ShaderDa ccl_device_inline void object_dir_transform(KernelGlobals *kg, const ShaderData *sd, float3 *D) { #ifdef __OBJECT_MOTION__ - *D = transform_direction_auto(&ccl_fetch(sd, ob_tfm), *D); + *D = transform_direction_auto(&sd->ob_tfm, *D); #else - Transform tfm = object_fetch_transform(kg, ccl_fetch(sd, object), OBJECT_TRANSFORM); + Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_TRANSFORM); *D = transform_direction(&tfm, *D); #endif } @@ -202,9 +201,9 @@ ccl_device_inline 
void object_dir_transform(KernelGlobals *kg, const ShaderData ccl_device_inline void object_inverse_dir_transform(KernelGlobals *kg, const ShaderData *sd, float3 *D) { #ifdef __OBJECT_MOTION__ - *D = transform_direction_auto(&ccl_fetch(sd, ob_itfm), *D); + *D = transform_direction_auto(&sd->ob_itfm, *D); #else - Transform tfm = object_fetch_transform(kg, ccl_fetch(sd, object), OBJECT_INVERSE_TRANSFORM); + Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_INVERSE_TRANSFORM); *D = transform_direction(&tfm, *D); #endif } @@ -213,13 +212,13 @@ ccl_device_inline void object_inverse_dir_transform(KernelGlobals *kg, const Sha ccl_device_inline float3 object_location(KernelGlobals *kg, const ShaderData *sd) { - if(ccl_fetch(sd, object) == OBJECT_NONE) + if(sd->object == OBJECT_NONE) return make_float3(0.0f, 0.0f, 0.0f); #ifdef __OBJECT_MOTION__ - return make_float3(ccl_fetch(sd, ob_tfm).x.w, ccl_fetch(sd, ob_tfm).y.w, ccl_fetch(sd, ob_tfm).z.w); + return make_float3(sd->ob_tfm.x.w, sd->ob_tfm.y.w, sd->ob_tfm.z.w); #else - Transform tfm = object_fetch_transform(kg, ccl_fetch(sd, object), OBJECT_TRANSFORM); + Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_TRANSFORM); return make_float3(tfm.x.w, tfm.y.w, tfm.z.w); #endif } @@ -327,7 +326,7 @@ ccl_device_inline uint object_patch_map_offset(KernelGlobals *kg, int object) ccl_device int shader_pass_id(KernelGlobals *kg, const ShaderData *sd) { - return kernel_tex_fetch(__shader_flag, (ccl_fetch(sd, shader) & SHADER_MASK)*SHADER_SIZE + 1); + return kernel_tex_fetch(__shader_flag, (sd->shader & SHADER_MASK)*SHADER_SIZE + 1); } /* Particle data from which object was instanced */ @@ -416,17 +415,18 @@ ccl_device_inline float3 bvh_clamp_direction(float3 dir) ccl_device_inline float3 bvh_inverse_direction(float3 dir) { - /* TODO(sergey): Currently disabled, gives speedup but causes precision issues. 
*/ -#if defined(__KERNEL_SSE__) && 0 return rcp(dir); -#else - return 1.0f / dir; -#endif } /* Transform ray into object space to enter static object in BVH */ -ccl_device_inline void bvh_instance_push(KernelGlobals *kg, int object, const Ray *ray, float3 *P, float3 *dir, float3 *idir, ccl_addr_space float *t) +ccl_device_inline float bvh_instance_push(KernelGlobals *kg, + int object, + const Ray *ray, + float3 *P, + float3 *dir, + float3 *idir, + float t) { Transform tfm = object_fetch_transform(kg, object, OBJECT_INVERSE_TRANSFORM); @@ -436,8 +436,11 @@ ccl_device_inline void bvh_instance_push(KernelGlobals *kg, int object, const Ra *dir = bvh_clamp_direction(normalize_len(transform_direction(&tfm, ray->D), &len)); *idir = bvh_inverse_direction(*dir); - if(*t != FLT_MAX) - *t *= len; + if(t != FLT_MAX) { + t *= len; + } + + return t; } #ifdef __QBVH__ @@ -474,16 +477,24 @@ ccl_device_inline void qbvh_instance_push(KernelGlobals *kg, /* Transorm ray to exit static object in BVH */ -ccl_device_inline void bvh_instance_pop(KernelGlobals *kg, int object, const Ray *ray, float3 *P, float3 *dir, float3 *idir, ccl_addr_space float *t) +ccl_device_inline float bvh_instance_pop(KernelGlobals *kg, + int object, + const Ray *ray, + float3 *P, + float3 *dir, + float3 *idir, + float t) { - if(*t != FLT_MAX) { + if(t != FLT_MAX) { Transform tfm = object_fetch_transform(kg, object, OBJECT_INVERSE_TRANSFORM); - *t /= len(transform_direction(&tfm, ray->D)); + t /= len(transform_direction(&tfm, ray->D)); } *P = ray->P; *dir = bvh_clamp_direction(ray->D); *idir = bvh_inverse_direction(*dir); + + return t; } /* Same as above, but returns scale factor to apply to multiple intersection distances */ @@ -502,13 +513,13 @@ ccl_device_inline void bvh_instance_pop_factor(KernelGlobals *kg, int object, co #ifdef __OBJECT_MOTION__ /* Transform ray into object space to enter motion blurred object in BVH */ -ccl_device_inline void bvh_instance_motion_push(KernelGlobals *kg, +ccl_device_inline 
float bvh_instance_motion_push(KernelGlobals *kg, int object, const Ray *ray, float3 *P, float3 *dir, float3 *idir, - ccl_addr_space float *t, + float t, Transform *itfm) { object_fetch_transform_motion_test(kg, object, ray->time, itfm); @@ -519,8 +530,11 @@ ccl_device_inline void bvh_instance_motion_push(KernelGlobals *kg, *dir = bvh_clamp_direction(normalize_len(transform_direction(itfm, ray->D), &len)); *idir = bvh_inverse_direction(*dir); - if(*t != FLT_MAX) - *t *= len; + if(t != FLT_MAX) { + t *= len; + } + + return t; } #ifdef __QBVH__ @@ -558,22 +572,24 @@ ccl_device_inline void qbvh_instance_motion_push(KernelGlobals *kg, /* Transorm ray to exit motion blurred object in BVH */ -ccl_device_inline void bvh_instance_motion_pop(KernelGlobals *kg, - int object, - const Ray *ray, - float3 *P, - float3 *dir, - float3 *idir, - ccl_addr_space float *t, - Transform *itfm) -{ - if(*t != FLT_MAX) { - *t /= len(transform_direction(itfm, ray->D)); +ccl_device_inline float bvh_instance_motion_pop(KernelGlobals *kg, + int object, + const Ray *ray, + float3 *P, + float3 *dir, + float3 *idir, + float t, + Transform *itfm) +{ + if(t != FLT_MAX) { + t /= len(transform_direction(itfm, ray->D)); } *P = ray->P; *dir = bvh_clamp_direction(ray->D); *idir = bvh_inverse_direction(*dir); + + return t; } /* Same as above, but returns scale factor to apply to multiple intersection distances */ diff --git a/intern/cycles/kernel/geom/geom_patch.h b/intern/cycles/kernel/geom/geom_patch.h index 6a0ff5a4a04..5663b598508 100644 --- a/intern/cycles/kernel/geom/geom_patch.h +++ b/intern/cycles/kernel/geom/geom_patch.h @@ -267,7 +267,7 @@ ccl_device float patch_eval_float(KernelGlobals *kg, const ShaderData *sd, int o float weights_du[PATCH_MAX_CONTROL_VERTS]; float weights_dv[PATCH_MAX_CONTROL_VERTS]; - int num_control = patch_eval_control_verts(kg, ccl_fetch(sd, object), patch, u, v, channel, + int num_control = patch_eval_control_verts(kg, sd->object, patch, u, v, channel, indices, weights, 
weights_du, weights_dv); float val = 0.0f; @@ -294,7 +294,7 @@ ccl_device float3 patch_eval_float3(KernelGlobals *kg, const ShaderData *sd, int float weights_du[PATCH_MAX_CONTROL_VERTS]; float weights_dv[PATCH_MAX_CONTROL_VERTS]; - int num_control = patch_eval_control_verts(kg, ccl_fetch(sd, object), patch, u, v, channel, + int num_control = patch_eval_control_verts(kg, sd->object, patch, u, v, channel, indices, weights, weights_du, weights_dv); float3 val = make_float3(0.0f, 0.0f, 0.0f); @@ -321,7 +321,7 @@ ccl_device float3 patch_eval_uchar4(KernelGlobals *kg, const ShaderData *sd, int float weights_du[PATCH_MAX_CONTROL_VERTS]; float weights_dv[PATCH_MAX_CONTROL_VERTS]; - int num_control = patch_eval_control_verts(kg, ccl_fetch(sd, object), patch, u, v, channel, + int num_control = patch_eval_control_verts(kg, sd->object, patch, u, v, channel, indices, weights, weights_du, weights_dv); float3 val = make_float3(0.0f, 0.0f, 0.0f); diff --git a/intern/cycles/kernel/geom/geom_primitive.h b/intern/cycles/kernel/geom/geom_primitive.h index dbf0b804b5d..989f1574e94 100644 --- a/intern/cycles/kernel/geom/geom_primitive.h +++ b/intern/cycles/kernel/geom/geom_primitive.h @@ -28,19 +28,19 @@ ccl_device_inline float primitive_attribute_float(KernelGlobals *kg, const AttributeDescriptor desc, float *dx, float *dy) { - if(ccl_fetch(sd, type) & PRIMITIVE_ALL_TRIANGLE) { + if(sd->type & PRIMITIVE_ALL_TRIANGLE) { if(subd_triangle_patch(kg, sd) == ~0) return triangle_attribute_float(kg, sd, desc, dx, dy); else return subd_triangle_attribute_float(kg, sd, desc, dx, dy); } #ifdef __HAIR__ - else if(ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE) { + else if(sd->type & PRIMITIVE_ALL_CURVE) { return curve_attribute_float(kg, sd, desc, dx, dy); } #endif #ifdef __VOLUME__ - else if(ccl_fetch(sd, type) & PRIMITIVE_ALL_VOLUME) { + else if(sd->type & PRIMITIVE_ALL_VOLUME) { return volume_attribute_float(kg, sd, desc, dx, dy); } #endif @@ -56,19 +56,19 @@ ccl_device_inline float3 
primitive_attribute_float3(KernelGlobals *kg, const AttributeDescriptor desc, float3 *dx, float3 *dy) { - if(ccl_fetch(sd, type) & PRIMITIVE_ALL_TRIANGLE) { + if(sd->type & PRIMITIVE_ALL_TRIANGLE) { if(subd_triangle_patch(kg, sd) == ~0) return triangle_attribute_float3(kg, sd, desc, dx, dy); else return subd_triangle_attribute_float3(kg, sd, desc, dx, dy); } #ifdef __HAIR__ - else if(ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE) { + else if(sd->type & PRIMITIVE_ALL_CURVE) { return curve_attribute_float3(kg, sd, desc, dx, dy); } #endif #ifdef __VOLUME__ - else if(ccl_fetch(sd, type) & PRIMITIVE_ALL_VOLUME) { + else if(sd->type & PRIMITIVE_ALL_VOLUME) { return volume_attribute_float3(kg, sd, desc, dx, dy); } #endif @@ -118,9 +118,9 @@ ccl_device bool primitive_ptex(KernelGlobals *kg, ShaderData *sd, float2 *uv, in ccl_device float3 primitive_tangent(KernelGlobals *kg, ShaderData *sd) { #ifdef __HAIR__ - if(ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE) + if(sd->type & PRIMITIVE_ALL_CURVE) # ifdef __DPDU__ - return normalize(ccl_fetch(sd, dPdu)); + return normalize(sd->dPdu); # else return make_float3(0.0f, 0.0f, 0.0f); # endif @@ -133,12 +133,12 @@ ccl_device float3 primitive_tangent(KernelGlobals *kg, ShaderData *sd) float3 data = primitive_attribute_float3(kg, sd, desc, NULL, NULL); data = make_float3(-(data.y - 0.5f), (data.x - 0.5f), 0.0f); object_normal_transform(kg, sd, &data); - return cross(ccl_fetch(sd, N), normalize(cross(data, ccl_fetch(sd, N)))); + return cross(sd->N, normalize(cross(data, sd->N))); } else { /* otherwise use surface derivatives */ #ifdef __DPDU__ - return normalize(ccl_fetch(sd, dPdu)); + return normalize(sd->dPdu); #else return make_float3(0.0f, 0.0f, 0.0f); #endif @@ -153,16 +153,17 @@ ccl_device_inline float4 primitive_motion_vector(KernelGlobals *kg, ShaderData * float3 center; #ifdef __HAIR__ - bool is_curve_primitive = ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE; + bool is_curve_primitive = sd->type & PRIMITIVE_ALL_CURVE; 
if(is_curve_primitive) { center = curve_motion_center_location(kg, sd); - if(!(ccl_fetch(sd, flag) & SD_TRANSFORM_APPLIED)) + if(!(sd->object_flag & SD_OBJECT_TRANSFORM_APPLIED)) { object_position_transform(kg, sd, ¢er); + } } else #endif - center = ccl_fetch(sd, P); + center = sd->P; float3 motion_pre = center, motion_post = center; @@ -172,16 +173,16 @@ ccl_device_inline float4 primitive_motion_vector(KernelGlobals *kg, ShaderData * if(desc.offset != ATTR_STD_NOT_FOUND) { /* get motion info */ int numverts, numkeys; - object_motion_info(kg, ccl_fetch(sd, object), NULL, &numverts, &numkeys); + object_motion_info(kg, sd->object, NULL, &numverts, &numkeys); /* lookup attributes */ motion_pre = primitive_attribute_float3(kg, sd, desc, NULL, NULL); - desc.offset += (ccl_fetch(sd, type) & PRIMITIVE_ALL_TRIANGLE)? numverts: numkeys; + desc.offset += (sd->type & PRIMITIVE_ALL_TRIANGLE)? numverts: numkeys; motion_post = primitive_attribute_float3(kg, sd, desc, NULL, NULL); #ifdef __HAIR__ - if(is_curve_primitive && (ccl_fetch(sd, flag) & SD_OBJECT_HAS_VERTEX_MOTION) == 0) { + if(is_curve_primitive && (sd->object_flag & SD_OBJECT_HAS_VERTEX_MOTION) == 0) { object_position_transform(kg, sd, &motion_pre); object_position_transform(kg, sd, &motion_post); } @@ -192,10 +193,10 @@ ccl_device_inline float4 primitive_motion_vector(KernelGlobals *kg, ShaderData * * transformation was set match the world/object space of motion_pre/post */ Transform tfm; - tfm = object_fetch_vector_transform(kg, ccl_fetch(sd, object), OBJECT_VECTOR_MOTION_PRE); + tfm = object_fetch_vector_transform(kg, sd->object, OBJECT_VECTOR_MOTION_PRE); motion_pre = transform_point(&tfm, motion_pre); - tfm = object_fetch_vector_transform(kg, ccl_fetch(sd, object), OBJECT_VECTOR_MOTION_POST); + tfm = object_fetch_vector_transform(kg, sd->object, OBJECT_VECTOR_MOTION_POST); motion_post = transform_point(&tfm, motion_post); float3 motion_center; diff --git a/intern/cycles/kernel/geom/geom_subd_triangle.h 
b/intern/cycles/kernel/geom/geom_subd_triangle.h index 647840dc696..044e82f03d4 100644 --- a/intern/cycles/kernel/geom/geom_subd_triangle.h +++ b/intern/cycles/kernel/geom/geom_subd_triangle.h @@ -22,14 +22,14 @@ CCL_NAMESPACE_BEGIN ccl_device_inline uint subd_triangle_patch(KernelGlobals *kg, const ShaderData *sd) { - return (ccl_fetch(sd, prim) != PRIM_NONE) ? kernel_tex_fetch(__tri_patch, ccl_fetch(sd, prim)) : ~0; + return (sd->prim != PRIM_NONE) ? kernel_tex_fetch(__tri_patch, sd->prim) : ~0; } /* UV coords of triangle within patch */ ccl_device_inline void subd_triangle_patch_uv(KernelGlobals *kg, const ShaderData *sd, float2 uv[3]) { - uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, ccl_fetch(sd, prim)); + uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, sd->prim); uv[0] = kernel_tex_fetch(__tri_patch_uv, tri_vindex.x); uv[1] = kernel_tex_fetch(__tri_patch_uv, tri_vindex.y); @@ -110,7 +110,7 @@ ccl_device_noinline float subd_triangle_attribute_float(KernelGlobals *kg, const float2 dpdv = uv[1] - uv[2]; /* p is [s, t] */ - float2 p = dpdu * ccl_fetch(sd, u) + dpdv * ccl_fetch(sd, v) + uv[2]; + float2 p = dpdu * sd->u + dpdv * sd->v + uv[2]; float a, dads, dadt; a = patch_eval_float(kg, sd, desc.offset, patch, p.x, p.y, 0, &dads, &dadt); @@ -123,8 +123,8 @@ ccl_device_noinline float subd_triangle_attribute_float(KernelGlobals *kg, const float dtdv = dpdv.y; if(dx) { - float dudx = ccl_fetch(sd, du).dx; - float dvdx = ccl_fetch(sd, dv).dx; + float dudx = sd->du.dx; + float dvdx = sd->dv.dx; float dsdx = dsdu*dudx + dsdv*dvdx; float dtdx = dtdu*dudx + dtdv*dvdx; @@ -132,8 +132,8 @@ ccl_device_noinline float subd_triangle_attribute_float(KernelGlobals *kg, const *dx = dads*dsdx + dadt*dtdx; } if(dy) { - float dudy = ccl_fetch(sd, du).dy; - float dvdy = ccl_fetch(sd, dv).dy; + float dudy = sd->du.dy; + float dvdy = sd->dv.dy; float dsdy = dsdu*dudy + dsdv*dvdy; float dtdy = dtdu*dudy + dtdv*dvdy; @@ -174,11 +174,11 @@ ccl_device_noinline float 
subd_triangle_attribute_float(KernelGlobals *kg, const float c = mix(mix(f0, f1, uv[2].x), mix(f3, f2, uv[2].x), uv[2].y); #ifdef __RAY_DIFFERENTIALS__ - if(dx) *dx = ccl_fetch(sd, du).dx*a + ccl_fetch(sd, dv).dx*b - (ccl_fetch(sd, du).dx + ccl_fetch(sd, dv).dx)*c; - if(dy) *dy = ccl_fetch(sd, du).dy*a + ccl_fetch(sd, dv).dy*b - (ccl_fetch(sd, du).dy + ccl_fetch(sd, dv).dy)*c; + if(dx) *dx = sd->du.dx*a + sd->dv.dx*b - (sd->du.dx + sd->dv.dx)*c; + if(dy) *dy = sd->du.dy*a + sd->dv.dy*b - (sd->du.dy + sd->dv.dy)*c; #endif - return ccl_fetch(sd, u)*a + ccl_fetch(sd, v)*b + (1.0f - ccl_fetch(sd, u) - ccl_fetch(sd, v))*c; + return sd->u*a + sd->v*b + (1.0f - sd->u - sd->v)*c; } else if(desc.element == ATTR_ELEMENT_CORNER) { float2 uv[3]; @@ -202,11 +202,11 @@ ccl_device_noinline float subd_triangle_attribute_float(KernelGlobals *kg, const float c = mix(mix(f0, f1, uv[2].x), mix(f3, f2, uv[2].x), uv[2].y); #ifdef __RAY_DIFFERENTIALS__ - if(dx) *dx = ccl_fetch(sd, du).dx*a + ccl_fetch(sd, dv).dx*b - (ccl_fetch(sd, du).dx + ccl_fetch(sd, dv).dx)*c; - if(dy) *dy = ccl_fetch(sd, du).dy*a + ccl_fetch(sd, dv).dy*b - (ccl_fetch(sd, du).dy + ccl_fetch(sd, dv).dy)*c; + if(dx) *dx = sd->du.dx*a + sd->dv.dx*b - (sd->du.dx + sd->dv.dx)*c; + if(dy) *dy = sd->du.dy*a + sd->dv.dy*b - (sd->du.dy + sd->dv.dy)*c; #endif - return ccl_fetch(sd, u)*a + ccl_fetch(sd, v)*b + (1.0f - ccl_fetch(sd, u) - ccl_fetch(sd, v))*c; + return sd->u*a + sd->v*b + (1.0f - sd->u - sd->v)*c; } else { if(dx) *dx = 0.0f; @@ -229,7 +229,7 @@ ccl_device_noinline float3 subd_triangle_attribute_float3(KernelGlobals *kg, con float2 dpdv = uv[1] - uv[2]; /* p is [s, t] */ - float2 p = dpdu * ccl_fetch(sd, u) + dpdv * ccl_fetch(sd, v) + uv[2]; + float2 p = dpdu * sd->u + dpdv * sd->v + uv[2]; float3 a, dads, dadt; @@ -248,8 +248,8 @@ ccl_device_noinline float3 subd_triangle_attribute_float3(KernelGlobals *kg, con float dtdv = dpdv.y; if(dx) { - float dudx = ccl_fetch(sd, du).dx; - float dvdx = ccl_fetch(sd, dv).dx; + 
float dudx = sd->du.dx; + float dvdx = sd->dv.dx; float dsdx = dsdu*dudx + dsdv*dvdx; float dtdx = dtdu*dudx + dtdv*dvdx; @@ -257,8 +257,8 @@ ccl_device_noinline float3 subd_triangle_attribute_float3(KernelGlobals *kg, con *dx = dads*dsdx + dadt*dtdx; } if(dy) { - float dudy = ccl_fetch(sd, du).dy; - float dvdy = ccl_fetch(sd, dv).dy; + float dudy = sd->du.dy; + float dvdy = sd->dv.dy; float dsdy = dsdu*dudy + dsdv*dvdy; float dtdy = dtdu*dudy + dtdv*dvdy; @@ -299,11 +299,11 @@ ccl_device_noinline float3 subd_triangle_attribute_float3(KernelGlobals *kg, con float3 c = mix(mix(f0, f1, uv[2].x), mix(f3, f2, uv[2].x), uv[2].y); #ifdef __RAY_DIFFERENTIALS__ - if(dx) *dx = ccl_fetch(sd, du).dx*a + ccl_fetch(sd, dv).dx*b - (ccl_fetch(sd, du).dx + ccl_fetch(sd, dv).dx)*c; - if(dy) *dy = ccl_fetch(sd, du).dy*a + ccl_fetch(sd, dv).dy*b - (ccl_fetch(sd, du).dy + ccl_fetch(sd, dv).dy)*c; + if(dx) *dx = sd->du.dx*a + sd->dv.dx*b - (sd->du.dx + sd->dv.dx)*c; + if(dy) *dy = sd->du.dy*a + sd->dv.dy*b - (sd->du.dy + sd->dv.dy)*c; #endif - return ccl_fetch(sd, u)*a + ccl_fetch(sd, v)*b + (1.0f - ccl_fetch(sd, u) - ccl_fetch(sd, v))*c; + return sd->u*a + sd->v*b + (1.0f - sd->u - sd->v)*c; } else if(desc.element == ATTR_ELEMENT_CORNER || desc.element == ATTR_ELEMENT_CORNER_BYTE) { float2 uv[3]; @@ -337,11 +337,11 @@ ccl_device_noinline float3 subd_triangle_attribute_float3(KernelGlobals *kg, con float3 c = mix(mix(f0, f1, uv[2].x), mix(f3, f2, uv[2].x), uv[2].y); #ifdef __RAY_DIFFERENTIALS__ - if(dx) *dx = ccl_fetch(sd, du).dx*a + ccl_fetch(sd, dv).dx*b - (ccl_fetch(sd, du).dx + ccl_fetch(sd, dv).dx)*c; - if(dy) *dy = ccl_fetch(sd, du).dy*a + ccl_fetch(sd, dv).dy*b - (ccl_fetch(sd, du).dy + ccl_fetch(sd, dv).dy)*c; + if(dx) *dx = sd->du.dx*a + sd->dv.dx*b - (sd->du.dx + sd->dv.dx)*c; + if(dy) *dy = sd->du.dy*a + sd->dv.dy*b - (sd->du.dy + sd->dv.dy)*c; #endif - return ccl_fetch(sd, u)*a + ccl_fetch(sd, v)*b + (1.0f - ccl_fetch(sd, u) - ccl_fetch(sd, v))*c; + return sd->u*a + sd->v*b 
+ (1.0f - sd->u - sd->v)*c; } else { if(dx) *dx = make_float3(0.0f, 0.0f, 0.0f); diff --git a/intern/cycles/kernel/geom/geom_triangle.h b/intern/cycles/kernel/geom/geom_triangle.h index 17538872ead..105aee8da15 100644 --- a/intern/cycles/kernel/geom/geom_triangle.h +++ b/intern/cycles/kernel/geom/geom_triangle.h @@ -26,16 +26,18 @@ CCL_NAMESPACE_BEGIN ccl_device_inline float3 triangle_normal(KernelGlobals *kg, ShaderData *sd) { /* load triangle vertices */ - const uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, ccl_fetch(sd, prim)); + const uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, sd->prim); const float3 v0 = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex.w+0)); const float3 v1 = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex.w+1)); const float3 v2 = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex.w+2)); /* return normal */ - if(ccl_fetch(sd, flag) & SD_NEGATIVE_SCALE_APPLIED) + if(sd->object_flag & SD_OBJECT_NEGATIVE_SCALE_APPLIED) { return normalize(cross(v2 - v0, v1 - v0)); - else + } + else { return normalize(cross(v1 - v0, v2 - v0)); + } } /* point and normal on triangle */ @@ -46,20 +48,18 @@ ccl_device_inline void triangle_point_normal(KernelGlobals *kg, int object, int float3 v0 = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex.w+0)); float3 v1 = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex.w+1)); float3 v2 = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex.w+2)); - /* compute point */ float t = 1.0f - u - v; *P = (u*v0 + v*v1 + t*v2); - /* get object flags */ int object_flag = kernel_tex_fetch(__object_flag, object); - /* compute normal */ - if(object_flag & SD_NEGATIVE_SCALE_APPLIED) + if(object_flag & SD_OBJECT_NEGATIVE_SCALE_APPLIED) { *Ng = normalize(cross(v2 - v0, v1 - v0)); - else + } + else { *Ng = normalize(cross(v1 - v0, v2 - v0)); - + } /* shader`*/ *shader = kernel_tex_fetch(__tri_shader, prim); } @@ -76,7 +76,7 @@ ccl_device_inline void 
triangle_vertices(KernelGlobals *kg, int prim, float3 P[3 /* Interpolate smooth vertex normal from vertices */ -ccl_device_inline float3 triangle_smooth_normal(KernelGlobals *kg, int prim, float u, float v) +ccl_device_inline float3 triangle_smooth_normal(KernelGlobals *kg, float3 Ng, int prim, float u, float v) { /* load triangle vertices */ const uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, prim); @@ -84,7 +84,9 @@ ccl_device_inline float3 triangle_smooth_normal(KernelGlobals *kg, int prim, flo float3 n1 = float4_to_float3(kernel_tex_fetch(__tri_vnormal, tri_vindex.y)); float3 n2 = float4_to_float3(kernel_tex_fetch(__tri_vnormal, tri_vindex.z)); - return normalize((1.0f - u - v)*n2 + u*n0 + v*n1); + float3 N = safe_normalize((1.0f - u - v)*n2 + u*n0 + v*n1); + + return is_zero(N)? Ng: N; } /* Ray differentials on triangle */ @@ -110,34 +112,34 @@ ccl_device float triangle_attribute_float(KernelGlobals *kg, const ShaderData *s if(dx) *dx = 0.0f; if(dy) *dy = 0.0f; - return kernel_tex_fetch(__attributes_float, desc.offset + ccl_fetch(sd, prim)); + return kernel_tex_fetch(__attributes_float, desc.offset + sd->prim); } else if(desc.element == ATTR_ELEMENT_VERTEX || desc.element == ATTR_ELEMENT_VERTEX_MOTION) { - uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, ccl_fetch(sd, prim)); + uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, sd->prim); float f0 = kernel_tex_fetch(__attributes_float, desc.offset + tri_vindex.x); float f1 = kernel_tex_fetch(__attributes_float, desc.offset + tri_vindex.y); float f2 = kernel_tex_fetch(__attributes_float, desc.offset + tri_vindex.z); #ifdef __RAY_DIFFERENTIALS__ - if(dx) *dx = ccl_fetch(sd, du).dx*f0 + ccl_fetch(sd, dv).dx*f1 - (ccl_fetch(sd, du).dx + ccl_fetch(sd, dv).dx)*f2; - if(dy) *dy = ccl_fetch(sd, du).dy*f0 + ccl_fetch(sd, dv).dy*f1 - (ccl_fetch(sd, du).dy + ccl_fetch(sd, dv).dy)*f2; + if(dx) *dx = sd->du.dx*f0 + sd->dv.dx*f1 - (sd->du.dx + sd->dv.dx)*f2; + if(dy) *dy = sd->du.dy*f0 + sd->dv.dy*f1 - (sd->du.dy + 
sd->dv.dy)*f2; #endif - return ccl_fetch(sd, u)*f0 + ccl_fetch(sd, v)*f1 + (1.0f - ccl_fetch(sd, u) - ccl_fetch(sd, v))*f2; + return sd->u*f0 + sd->v*f1 + (1.0f - sd->u - sd->v)*f2; } else if(desc.element == ATTR_ELEMENT_CORNER) { - int tri = desc.offset + ccl_fetch(sd, prim)*3; + int tri = desc.offset + sd->prim*3; float f0 = kernel_tex_fetch(__attributes_float, tri + 0); float f1 = kernel_tex_fetch(__attributes_float, tri + 1); float f2 = kernel_tex_fetch(__attributes_float, tri + 2); #ifdef __RAY_DIFFERENTIALS__ - if(dx) *dx = ccl_fetch(sd, du).dx*f0 + ccl_fetch(sd, dv).dx*f1 - (ccl_fetch(sd, du).dx + ccl_fetch(sd, dv).dx)*f2; - if(dy) *dy = ccl_fetch(sd, du).dy*f0 + ccl_fetch(sd, dv).dy*f1 - (ccl_fetch(sd, du).dy + ccl_fetch(sd, dv).dy)*f2; + if(dx) *dx = sd->du.dx*f0 + sd->dv.dx*f1 - (sd->du.dx + sd->dv.dx)*f2; + if(dy) *dy = sd->du.dy*f0 + sd->dv.dy*f1 - (sd->du.dy + sd->dv.dy)*f2; #endif - return ccl_fetch(sd, u)*f0 + ccl_fetch(sd, v)*f1 + (1.0f - ccl_fetch(sd, u) - ccl_fetch(sd, v))*f2; + return sd->u*f0 + sd->v*f1 + (1.0f - sd->u - sd->v)*f2; } else { if(dx) *dx = 0.0f; @@ -153,24 +155,24 @@ ccl_device float3 triangle_attribute_float3(KernelGlobals *kg, const ShaderData if(dx) *dx = make_float3(0.0f, 0.0f, 0.0f); if(dy) *dy = make_float3(0.0f, 0.0f, 0.0f); - return float4_to_float3(kernel_tex_fetch(__attributes_float3, desc.offset + ccl_fetch(sd, prim))); + return float4_to_float3(kernel_tex_fetch(__attributes_float3, desc.offset + sd->prim)); } else if(desc.element == ATTR_ELEMENT_VERTEX || desc.element == ATTR_ELEMENT_VERTEX_MOTION) { - uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, ccl_fetch(sd, prim)); + uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, sd->prim); float3 f0 = float4_to_float3(kernel_tex_fetch(__attributes_float3, desc.offset + tri_vindex.x)); float3 f1 = float4_to_float3(kernel_tex_fetch(__attributes_float3, desc.offset + tri_vindex.y)); float3 f2 = float4_to_float3(kernel_tex_fetch(__attributes_float3, desc.offset + tri_vindex.z)); 
#ifdef __RAY_DIFFERENTIALS__ - if(dx) *dx = ccl_fetch(sd, du).dx*f0 + ccl_fetch(sd, dv).dx*f1 - (ccl_fetch(sd, du).dx + ccl_fetch(sd, dv).dx)*f2; - if(dy) *dy = ccl_fetch(sd, du).dy*f0 + ccl_fetch(sd, dv).dy*f1 - (ccl_fetch(sd, du).dy + ccl_fetch(sd, dv).dy)*f2; + if(dx) *dx = sd->du.dx*f0 + sd->dv.dx*f1 - (sd->du.dx + sd->dv.dx)*f2; + if(dy) *dy = sd->du.dy*f0 + sd->dv.dy*f1 - (sd->du.dy + sd->dv.dy)*f2; #endif - return ccl_fetch(sd, u)*f0 + ccl_fetch(sd, v)*f1 + (1.0f - ccl_fetch(sd, u) - ccl_fetch(sd, v))*f2; + return sd->u*f0 + sd->v*f1 + (1.0f - sd->u - sd->v)*f2; } else if(desc.element == ATTR_ELEMENT_CORNER || desc.element == ATTR_ELEMENT_CORNER_BYTE) { - int tri = desc.offset + ccl_fetch(sd, prim)*3; + int tri = desc.offset + sd->prim*3; float3 f0, f1, f2; if(desc.element == ATTR_ELEMENT_CORNER) { @@ -185,11 +187,11 @@ ccl_device float3 triangle_attribute_float3(KernelGlobals *kg, const ShaderData } #ifdef __RAY_DIFFERENTIALS__ - if(dx) *dx = ccl_fetch(sd, du).dx*f0 + ccl_fetch(sd, dv).dx*f1 - (ccl_fetch(sd, du).dx + ccl_fetch(sd, dv).dx)*f2; - if(dy) *dy = ccl_fetch(sd, du).dy*f0 + ccl_fetch(sd, dv).dy*f1 - (ccl_fetch(sd, du).dy + ccl_fetch(sd, dv).dy)*f2; + if(dx) *dx = sd->du.dx*f0 + sd->dv.dx*f1 - (sd->du.dx + sd->dv.dx)*f2; + if(dy) *dy = sd->du.dy*f0 + sd->dv.dy*f1 - (sd->du.dy + sd->dv.dy)*f2; #endif - return ccl_fetch(sd, u)*f0 + ccl_fetch(sd, v)*f1 + (1.0f - ccl_fetch(sd, u) - ccl_fetch(sd, v))*f2; + return sd->u*f0 + sd->v*f1 + (1.0f - sd->u - sd->v)*f2; } else { if(dx) *dx = make_float3(0.0f, 0.0f, 0.0f); diff --git a/intern/cycles/kernel/geom/geom_triangle_intersect.h b/intern/cycles/kernel/geom/geom_triangle_intersect.h index eb7340583c8..804e74d7e37 100644 --- a/intern/cycles/kernel/geom/geom_triangle_intersect.h +++ b/intern/cycles/kernel/geom/geom_triangle_intersect.h @@ -22,232 +22,50 @@ CCL_NAMESPACE_BEGIN -/* Workaround stupidness of CUDA/OpenCL which doesn't allow to access indexed - * component of float3 value. 
- */ -#ifndef __KERNEL_CPU__ -# define IDX(vec, idx) \ - ((idx == 0) ? ((vec).x) : ( (idx == 1) ? ((vec).y) : ((vec).z) )) -#else -# define IDX(vec, idx) ((vec)[idx]) -#endif - -/* Ray-Triangle intersection for BVH traversal - * - * Sven Woop - * Watertight Ray/Triangle Intersection - * - * http://jcgt.org/published/0002/01/05/paper.pdf - */ - -/* Precalculated data for the ray->tri intersection. */ -typedef struct IsectPrecalc { - /* Maximal dimension kz, and orthogonal dimensions. */ - int kx, ky, kz; - - /* Shear constants. */ - float Sx, Sy, Sz; -} IsectPrecalc; - -#if (defined(__KERNEL_OPENCL_APPLE__)) || \ - (defined(__KERNEL_CUDA__) && (defined(i386) || defined(_M_IX86))) -ccl_device_noinline -#else -ccl_device_inline -#endif -void triangle_intersect_precalc(float3 dir, - IsectPrecalc *isect_precalc) -{ - /* Calculate dimension where the ray direction is maximal. */ -#ifndef __KERNEL_SSE__ - int kz = util_max_axis(make_float3(fabsf(dir.x), - fabsf(dir.y), - fabsf(dir.z))); - int kx = kz + 1; if(kx == 3) kx = 0; - int ky = kx + 1; if(ky == 3) ky = 0; -#else - int kx, ky, kz; - /* Avoiding mispredicted branch on direction. */ - kz = util_max_axis(fabs(dir)); - static const char inc_xaxis[] = {1, 2, 0, 55}; - static const char inc_yaxis[] = {2, 0, 1, 55}; - kx = inc_xaxis[kz]; - ky = inc_yaxis[kz]; -#endif - - float dir_kz = IDX(dir, kz); - - /* Swap kx and ky dimensions to preserve winding direction of triangles. */ - if(dir_kz < 0.0f) { - int tmp = kx; - kx = ky; - ky = tmp; - } - - /* Calculate the shear constants. */ - float inv_dir_z = 1.0f / dir_kz; - isect_precalc->Sx = IDX(dir, kx) * inv_dir_z; - isect_precalc->Sy = IDX(dir, ky) * inv_dir_z; - isect_precalc->Sz = inv_dir_z; - - /* Store the dimensions. */ - isect_precalc->kx = kx; - isect_precalc->ky = ky; - isect_precalc->kz = kz; -} - -/* TODO(sergey): Make it general utility function. 
*/ -ccl_device_inline float xor_signmask(float x, int y) -{ - return __int_as_float(__float_as_int(x) ^ y); -} - ccl_device_inline bool triangle_intersect(KernelGlobals *kg, - const IsectPrecalc *isect_precalc, Intersection *isect, float3 P, + float3 dir, uint visibility, int object, - int triAddr) + int prim_addr) { - const int kx = isect_precalc->kx; - const int ky = isect_precalc->ky; - const int kz = isect_precalc->kz; - const float Sx = isect_precalc->Sx; - const float Sy = isect_precalc->Sy; - const float Sz = isect_precalc->Sz; - - /* Calculate vertices relative to ray origin. */ - const uint tri_vindex = kernel_tex_fetch(__prim_tri_index, triAddr); - -#if defined(__KERNEL_AVX2__) && defined(__KERNEL_SSE__) - const avxf avxf_P(P.m128, P.m128); - - const avxf tri_ab = kernel_tex_fetch_avxf(__prim_tri_verts, tri_vindex + 0); - const avxf tri_bc = kernel_tex_fetch_avxf(__prim_tri_verts, tri_vindex + 1); - - const avxf AB = tri_ab - avxf_P; - const avxf BC = tri_bc - avxf_P; - - const __m256i permuteMask = _mm256_set_epi32(0x3, kz, ky, kx, 0x3, kz, ky, kx); - - const avxf AB_k = shuffle(AB, permuteMask); - const avxf BC_k = shuffle(BC, permuteMask); - - /* Akz, Akz, Bkz, Bkz, Bkz, Bkz, Ckz, Ckz */ - const avxf ABBC_kz = shuffle<2>(AB_k, BC_k); - - /* Akx, Aky, Bkx, Bky, Bkx,Bky, Ckx, Cky */ - const avxf ABBC_kxy = shuffle<0,1,0,1>(AB_k, BC_k); - - const avxf Sxy(Sy, Sx, Sy, Sx); - - /* Ax, Ay, Bx, By, Bx, By, Cx, Cy */ - const avxf ABBC_xy = nmadd(ABBC_kz, Sxy, ABBC_kxy); - - float ABBC_kz_array[8]; - _mm256_storeu_ps((float*)&ABBC_kz_array, ABBC_kz); - - const float A_kz = ABBC_kz_array[0]; - const float B_kz = ABBC_kz_array[2]; - const float C_kz = ABBC_kz_array[6]; - - /* By, Bx, Cy, Cx, By, Bx, Ay, Ax */ - const avxf BCBA_yx = permute<3,2,7,6,3,2,1,0>(ABBC_xy); - - const avxf negMask(0,0,0,0,0x80000000, 0x80000000, 0x80000000, 0x80000000); - - /* W U V - * (AxBy-AyBx) (BxCy-ByCx) XX XX (BxBy-ByBx) (CxAy-CyAx) XX XX - */ - const avxf WUxxxxVxx_neg = 
_mm256_hsub_ps(ABBC_xy * BCBA_yx, negMask /* Dont care */); - - const avxf WUVWnegWUVW = permute<0,1,5,0,0,1,5,0>(WUxxxxVxx_neg) ^ negMask; - - /* Calculate scaled barycentric coordinates. */ - float WUVW_array[4]; - _mm_storeu_ps((float*)&WUVW_array, _mm256_castps256_ps128 (WUVWnegWUVW)); - - const float W = WUVW_array[0]; - const float U = WUVW_array[1]; - const float V = WUVW_array[2]; - - const int WUVW_mask = 0x7 & _mm256_movemask_ps(WUVWnegWUVW); - const int WUVW_zero = 0x7 & _mm256_movemask_ps(_mm256_cmp_ps(WUVWnegWUVW, - _mm256_setzero_ps(), 0)); - - if(!((WUVW_mask == 7) || (WUVW_mask == 0)) && ((WUVW_mask | WUVW_zero) != 7)) { - return false; - } + const uint tri_vindex = kernel_tex_fetch(__prim_tri_index, prim_addr); +#if defined(__KERNEL_SSE2__) && defined(__KERNEL_SSE__) + const ssef *ssef_verts = (ssef*)&kg->__prim_tri_verts.data[tri_vindex]; #else const float4 tri_a = kernel_tex_fetch(__prim_tri_verts, tri_vindex+0), tri_b = kernel_tex_fetch(__prim_tri_verts, tri_vindex+1), tri_c = kernel_tex_fetch(__prim_tri_verts, tri_vindex+2); - const float3 A = make_float3(tri_a.x - P.x, tri_a.y - P.y, tri_a.z - P.z); - const float3 B = make_float3(tri_b.x - P.x, tri_b.y - P.y, tri_b.z - P.z); - const float3 C = make_float3(tri_c.x - P.x, tri_c.y - P.y, tri_c.z - P.z); - - const float A_kx = IDX(A, kx), A_ky = IDX(A, ky), A_kz = IDX(A, kz); - const float B_kx = IDX(B, kx), B_ky = IDX(B, ky), B_kz = IDX(B, kz); - const float C_kx = IDX(C, kx), C_ky = IDX(C, ky), C_kz = IDX(C, kz); - - /* Perform shear and scale of vertices. */ - const float Ax = A_kx - Sx * A_kz; - const float Ay = A_ky - Sy * A_kz; - const float Bx = B_kx - Sx * B_kz; - const float By = B_ky - Sy * B_kz; - const float Cx = C_kx - Sx * C_kz; - const float Cy = C_ky - Sy * C_kz; - - /* Calculate scaled barycentric coordinates. 
*/ - float U = Cx * By - Cy * Bx; - float V = Ax * Cy - Ay * Cx; - float W = Bx * Ay - By * Ax; - if((U < 0.0f || V < 0.0f || W < 0.0f) && - (U > 0.0f || V > 0.0f || W > 0.0f)) - { - return false; - } #endif - - /* Calculate determinant. */ - float det = U + V + W; - if(UNLIKELY(det == 0.0f)) { - return false; - } - - /* Calculate scaled z-coordinates of vertices and use them to calculate - * the hit distance. - */ - const float T = (U * A_kz + V * B_kz + W * C_kz) * Sz; - const int sign_det = (__float_as_int(det) & 0x80000000); - const float sign_T = xor_signmask(T, sign_det); - if((sign_T < 0.0f) || - (sign_T > isect->t * xor_signmask(det, sign_det))) + float t, u, v; + if(ray_triangle_intersect(P, + dir, + isect->t, +#if defined(__KERNEL_SSE2__) && defined(__KERNEL_SSE__) + ssef_verts, +#else + float4_to_float3(tri_a), + float4_to_float3(tri_b), + float4_to_float3(tri_c), +#endif + &u, &v, &t)) { - return false; - } - #ifdef __VISIBILITY_FLAG__ - /* visibility flag test. we do it here under the assumption - * that most triangles are culled by node flags */ - if(kernel_tex_fetch(__prim_visibility, triAddr) & visibility) + /* Visibility flag test. we do it here under the assumption + * that most triangles are culled by node flags. + */ + if(kernel_tex_fetch(__prim_visibility, prim_addr) & visibility) #endif - { -#ifdef __KERNEL_CUDA__ - if(A == B && B == C) { - return false; + { + isect->prim = prim_addr; + isect->object = object; + isect->type = PRIMITIVE_TRIANGLE; + isect->u = u; + isect->v = v; + isect->t = t; + return true; } -#endif - /* Normalize U, V, W, and T. 
*/ - const float inv_det = 1.0f / det; - isect->prim = triAddr; - isect->object = object; - isect->type = PRIMITIVE_TRIANGLE; - isect->u = U * inv_det; - isect->v = V * inv_det; - isect->t = T * inv_det; - return true; } return false; } @@ -260,138 +78,37 @@ ccl_device_inline bool triangle_intersect(KernelGlobals *kg, #ifdef __SUBSURFACE__ ccl_device_inline void triangle_intersect_subsurface( KernelGlobals *kg, - const IsectPrecalc *isect_precalc, SubsurfaceIntersection *ss_isect, float3 P, + float3 dir, int object, - int triAddr, + int prim_addr, float tmax, uint *lcg_state, int max_hits) { - const int kx = isect_precalc->kx; - const int ky = isect_precalc->ky; - const int kz = isect_precalc->kz; - const float Sx = isect_precalc->Sx; - const float Sy = isect_precalc->Sy; - const float Sz = isect_precalc->Sz; - - /* Calculate vertices relative to ray origin. */ - const uint tri_vindex = kernel_tex_fetch(__prim_tri_index, triAddr); - const float4 tri_a = kernel_tex_fetch(__prim_tri_verts, tri_vindex+0), - tri_b = kernel_tex_fetch(__prim_tri_verts, tri_vindex+1), - tri_c = kernel_tex_fetch(__prim_tri_verts, tri_vindex+2); - -#if defined(__KERNEL_AVX2__) && defined(__KERNEL_SSE__) - const avxf avxf_P(P.m128, P.m128); - - const avxf tri_ab = kernel_tex_fetch_avxf(__prim_tri_verts, tri_vindex + 0); - const avxf tri_bc = kernel_tex_fetch_avxf(__prim_tri_verts, tri_vindex + 1); - - const avxf AB = tri_ab - avxf_P; - const avxf BC = tri_bc - avxf_P; - - const __m256i permuteMask = _mm256_set_epi32(0x3, kz, ky, kx, 0x3, kz, ky, kx); - - const avxf AB_k = shuffle(AB, permuteMask); - const avxf BC_k = shuffle(BC, permuteMask); - - /* Akz, Akz, Bkz, Bkz, Bkz, Bkz, Ckz, Ckz */ - const avxf ABBC_kz = shuffle<2>(AB_k, BC_k); - - /* Akx, Aky, Bkx, Bky, Bkx,Bky, Ckx, Cky */ - const avxf ABBC_kxy = shuffle<0,1,0,1>(AB_k, BC_k); - - const avxf Sxy(Sy, Sx, Sy, Sx); - - /* Ax, Ay, Bx, By, Bx, By, Cx, Cy */ - const avxf ABBC_xy = nmadd(ABBC_kz, Sxy, ABBC_kxy); - - float 
ABBC_kz_array[8]; - _mm256_storeu_ps((float*)&ABBC_kz_array, ABBC_kz); - - const float A_kz = ABBC_kz_array[0]; - const float B_kz = ABBC_kz_array[2]; - const float C_kz = ABBC_kz_array[6]; - - /* By, Bx, Cy, Cx, By, Bx, Ay, Ax */ - const avxf BCBA_yx = permute<3,2,7,6,3,2,1,0>(ABBC_xy); - - const avxf negMask(0,0,0,0,0x80000000, 0x80000000, 0x80000000, 0x80000000); - - /* W U V - * (AxBy-AyBx) (BxCy-ByCx) XX XX (BxBy-ByBx) (CxAy-CyAx) XX XX - */ - const avxf WUxxxxVxx_neg = _mm256_hsub_ps(ABBC_xy * BCBA_yx, negMask /* Dont care */); - - const avxf WUVWnegWUVW = permute<0,1,5,0,0,1,5,0>(WUxxxxVxx_neg) ^ negMask; - - /* Calculate scaled barycentric coordinates. */ - float WUVW_array[4]; - _mm_storeu_ps((float*)&WUVW_array, _mm256_castps256_ps128 (WUVWnegWUVW)); - - const float W = WUVW_array[0]; - const float U = WUVW_array[1]; - const float V = WUVW_array[2]; - - const int WUVW_mask = 0x7 & _mm256_movemask_ps(WUVWnegWUVW); - const int WUVW_zero = 0x7 & _mm256_movemask_ps(_mm256_cmp_ps(WUVWnegWUVW, - _mm256_setzero_ps(), 0)); - - if(!((WUVW_mask == 7) || (WUVW_mask == 0)) && ((WUVW_mask | WUVW_zero) != 7)) { - return; - } + const uint tri_vindex = kernel_tex_fetch(__prim_tri_index, prim_addr); +#if defined(__KERNEL_SSE2__) && defined(__KERNEL_SSE__) + const ssef *ssef_verts = (ssef*)&kg->__prim_tri_verts.data[tri_vindex]; #else - const float3 A = make_float3(tri_a.x - P.x, tri_a.y - P.y, tri_a.z - P.z); - const float3 B = make_float3(tri_b.x - P.x, tri_b.y - P.y, tri_b.z - P.z); - const float3 C = make_float3(tri_c.x - P.x, tri_c.y - P.y, tri_c.z - P.z); - - const float A_kx = IDX(A, kx), A_ky = IDX(A, ky), A_kz = IDX(A, kz); - const float B_kx = IDX(B, kx), B_ky = IDX(B, ky), B_kz = IDX(B, kz); - const float C_kx = IDX(C, kx), C_ky = IDX(C, ky), C_kz = IDX(C, kz); - - /* Perform shear and scale of vertices. 
*/ - const float Ax = A_kx - Sx * A_kz; - const float Ay = A_ky - Sy * A_kz; - const float Bx = B_kx - Sx * B_kz; - const float By = B_ky - Sy * B_kz; - const float Cx = C_kx - Sx * C_kz; - const float Cy = C_ky - Sy * C_kz; - - /* Calculate scaled barycentric coordinates. */ - float U = Cx * By - Cy * Bx; - float V = Ax * Cy - Ay * Cx; - float W = Bx * Ay - By * Ax; - - if((U < 0.0f || V < 0.0f || W < 0.0f) && - (U > 0.0f || V > 0.0f || W > 0.0f)) - { - return; - } + const float3 tri_a = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex+0)), + tri_b = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex+1)), + tri_c = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex+2)); #endif - - /* Calculate determinant. */ - float det = U + V + W; - if(UNLIKELY(det == 0.0f)) { - return; - } - - /* Calculate scaled z−coordinates of vertices and use them to calculate - * the hit distance. - */ - const int sign_det = (__float_as_int(det) & 0x80000000); - const float T = (U * A_kz + V * B_kz + W * C_kz) * Sz; - const float sign_T = xor_signmask(T, sign_det); - if((sign_T < 0.0f) || - (sign_T > tmax * xor_signmask(det, sign_det))) + float t, u, v; + if(!ray_triangle_intersect(P, + dir, + tmax, +#if defined(__KERNEL_SSE2__) && defined(__KERNEL_SSE__) + ssef_verts, +#else + tri_a, tri_b, tri_c, +#endif + &u, &v, &t)) { return; } - /* Normalize U, V, W, and T. */ - const float inv_det = 1.0f / det; - - const float t = T * inv_det; for(int i = min(max_hits, ss_isect->num_hits) - 1; i >= 0; --i) { if(ss_isect->hits[i].t == t) { return; @@ -415,21 +132,22 @@ ccl_device_inline void triangle_intersect_subsurface( /* record intersection */ Intersection *isect = &ss_isect->hits[hit]; - isect->prim = triAddr; + isect->prim = prim_addr; isect->object = object; isect->type = PRIMITIVE_TRIANGLE; - isect->u = U * inv_det; - isect->v = V * inv_det; + isect->u = u; + isect->v = v; isect->t = t; /* Record geometric normal. 
*/ - /* TODO(sergey): Use float4_to_float3() on just an edges. */ - const float3 v0 = float4_to_float3(tri_a); - const float3 v1 = float4_to_float3(tri_b); - const float3 v2 = float4_to_float3(tri_c); - ss_isect->Ng[hit] = normalize(cross(v1 - v0, v2 - v0)); -} +#if defined(__KERNEL_SSE2__) && defined(__KERNEL_SSE__) + const float3 tri_a = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex+0)), + tri_b = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex+1)), + tri_c = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex+2)); #endif + ss_isect->Ng[hit] = normalize(cross(tri_b - tri_a, tri_c - tri_a)); +} +#endif /* __SUBSURFACE__ */ /* Refine triangle intersection to more precise hit point. For rays that travel * far the precision is often not so good, this reintersects the primitive from @@ -457,7 +175,7 @@ ccl_device_inline float3 triangle_refine(KernelGlobals *kg, return P; } # ifdef __OBJECT_MOTION__ - Transform tfm = ccl_fetch(sd, ob_itfm); + Transform tfm = sd->ob_itfm; # else Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_INVERSE_TRANSFORM); # endif @@ -491,7 +209,7 @@ ccl_device_inline float3 triangle_refine(KernelGlobals *kg, if(isect->object != OBJECT_NONE) { # ifdef __OBJECT_MOTION__ - Transform tfm = ccl_fetch(sd, ob_tfm); + Transform tfm = sd->ob_tfm; # else Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_TRANSFORM); # endif @@ -519,7 +237,7 @@ ccl_device_inline float3 triangle_refine_subsurface(KernelGlobals *kg, if(isect->object != OBJECT_NONE) { #ifdef __OBJECT_MOTION__ - Transform tfm = ccl_fetch(sd, ob_itfm); + Transform tfm = sd->ob_itfm; #else Transform tfm = object_fetch_transform(kg, isect->object, @@ -557,7 +275,7 @@ ccl_device_inline float3 triangle_refine_subsurface(KernelGlobals *kg, if(isect->object != OBJECT_NONE) { #ifdef __OBJECT_MOTION__ - Transform tfm = ccl_fetch(sd, ob_tfm); + Transform tfm = sd->ob_tfm; #else Transform tfm = object_fetch_transform(kg, 
isect->object, @@ -570,6 +288,4 @@ ccl_device_inline float3 triangle_refine_subsurface(KernelGlobals *kg, return P; } -#undef IDX - CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/geom/geom_volume.h b/intern/cycles/kernel/geom/geom_volume.h index 28ea80f1a65..9a5b94c1f46 100644 --- a/intern/cycles/kernel/geom/geom_volume.h +++ b/intern/cycles/kernel/geom/geom_volume.h @@ -33,21 +33,6 @@ CCL_NAMESPACE_BEGIN /* Return position normalized to 0..1 in mesh bounds */ -#if defined(__KERNEL_CUDA__) && __CUDA_ARCH__ < 300 -ccl_device float4 volume_image_texture_3d(int id, float x, float y, float z) -{ - float4 r; - switch(id) { - case 0: r = kernel_tex_image_interp_3d(__tex_image_float4_3d_000, x, y, z); break; - case 1: r = kernel_tex_image_interp_3d(__tex_image_float4_3d_001, x, y, z); break; - case 2: r = kernel_tex_image_interp_3d(__tex_image_float4_3d_002, x, y, z); break; - case 3: r = kernel_tex_image_interp_3d(__tex_image_float4_3d_003, x, y, z); break; - case 4: r = kernel_tex_image_interp_3d(__tex_image_float4_3d_004, x, y, z); break; - } - return r; -} -#endif /* __KERNEL_CUDA__ */ - ccl_device_inline float3 volume_normalized_position(KernelGlobals *kg, const ShaderData *sd, float3 P) @@ -68,39 +53,14 @@ ccl_device_inline float3 volume_normalized_position(KernelGlobals *kg, ccl_device float volume_attribute_float(KernelGlobals *kg, const ShaderData *sd, const AttributeDescriptor desc, float *dx, float *dy) { - float r; - -#ifdef __KERNEL_CUDA__ -# if __CUDA_ARCH__ >= 300 - float3 P = volume_normalized_position(kg, sd, ccl_fetch(sd, P)); - CUtexObject tex = kernel_tex_fetch(__bindless_mapping, desc.offset); - r = kernel_tex_image_interp_3d_float(tex, P.x, P.y, P.z); -# else - float3 P = volume_normalized_position(kg, sd, ccl_fetch(sd, P)); - r = average(float4_to_float3(volume_image_texture_3d(desc.offset, P.x, P.y, P.z))); -# endif -#elif defined(__KERNEL_OPENCL__) - float3 P = volume_normalized_position(kg, sd, ccl_fetch(sd, P)); - r = 
average(float4_to_float3(kernel_tex_image_interp_3d(kg, desc.offset, P.x, P.y, P.z))); -#else - -#if 1 /* XXX WITH_OPENVDB ? */ - float3 P = ccl_fetch(sd, P); - /* XXX OpenVDB does not support cubic interpolation (could use quadratic though) - lukas_t */ -#if 0 - if(sd->flag & SD_VOLUME_CUBIC) - r = kernel_tex_voxel_float(desc.offset, P.x, P.y, P.z, ...) - else -#endif - r = kernel_tex_voxel_float(desc.offset, P.x, P.y, P.z, OPENVDB_SAMPLE_BOX); +#ifdef __OPENVDB__ + float3 P = sd->P; + /* XXX OpenVDB does not support cubic interpolation - lukas_t */ + float r = kernel_tex_voxel_float(desc.offset, P.x, P.y, P.z, OPENVDB_SAMPLE_BOX); #else - float3 P = volume_normalized_position(kg, sd, ccl_fetch(sd, P)); - if(sd->flag & SD_VOLUME_CUBIC) - r = average(float4_to_float3(kernel_tex_image_interp_3d_ex(desc.offset, P.x, P.y, P.z, INTERPOLATION_CUBIC))); - else - r = average(float4_to_float3(kernel_tex_image_interp_3d(desc.offset, P.x, P.y, P.z))); -#endif - + float3 P = volume_normalized_position(kg, sd, sd->P); + InterpolationType interp = (sd->flag & SD_VOLUME_CUBIC)? 
INTERPOLATION_CUBIC: INTERPOLATION_NONE; + float r = average(float4_to_float3(kernel_tex_image_interp_3d_float(kg, desc.offset, P.x, P.y, P.z, interp))); #endif if(dx) *dx = 0.0f; @@ -111,33 +71,14 @@ ccl_device float volume_attribute_float(KernelGlobals *kg, const ShaderData *sd, ccl_device float3 volume_attribute_float3(KernelGlobals *kg, const ShaderData *sd, const AttributeDescriptor desc, float3 *dx, float3 *dy) { - float3 r; - -#ifdef __KERNEL_CUDA__ -# if __CUDA_ARCH__ >= 300 - float3 P = volume_normalized_position(kg, sd, ccl_fetch(sd, P)); - CUtexObject tex = kernel_tex_fetch(__bindless_mapping, desc.offset); - r = float4_to_float3(kernel_tex_image_interp_3d_float4(tex, P.x, P.y, P.z)); -# else - float3 P = volume_normalized_position(kg, sd, ccl_fetch(sd, P)); - r = float4_to_float3(volume_image_texture_3d(desc.offset, P.x, P.y, P.z)); -# endif -#elif defined(__KERNEL_OPENCL__) - float3 P = volume_normalized_position(kg, sd, ccl_fetch(sd, P)); - r = float4_to_float3(kernel_tex_image_interp_3d(kg, desc.offset, P.x, P.y, P.z)); +#ifdef __OPENVDB__ + float3 P = sd->P; + /* XXX OpenVDB does not support cubic interpolation - lukas_t */ + float3 r = kernel_tex_voxel_float3(desc.offset, P.x, P.y, P.z, OPENVDB_SAMPLE_BOX); #else - -#if 1 /* XXX WITH_OPENVDB ? */ - float3 P = ccl_fetch(sd, P); - r = kernel_tex_voxel_float3(desc.offset, P.x, P.y, P.z, OPENVDB_SAMPLE_POINT); -#else - float3 P = volume_normalized_position(kg, sd, ccl_fetch(sd, P)); - if(sd->flag & SD_VOLUME_CUBIC) - r = float4_to_float3(kernel_tex_image_interp_3d_ex(desc.offset, P.x, P.y, P.z, INTERPOLATION_CUBIC)); - else - r = float4_to_float3(kernel_tex_image_interp_3d(desc.offset, P.x, P.y, P.z)); -#endif - + float3 P = volume_normalized_position(kg, sd, sd->P); + InterpolationType interp = (sd->flag & SD_VOLUME_CUBIC)? 
INTERPOLATION_CUBIC: INTERPOLATION_NONE; + float3 r = float4_to_float3(kernel_tex_image_interp_3d(kg, desc.offset, P.x, P.y, P.z, interp)); #endif if(dx) *dx = make_float3(0.0f, 0.0f, 0.0f); diff --git a/intern/cycles/kernel/kernel.h b/intern/cycles/kernel/kernel.h index 9279a94c13a..84a988f1dbc 100644 --- a/intern/cycles/kernel/kernel.h +++ b/intern/cycles/kernel/kernel.h @@ -19,7 +19,8 @@ /* CPU Kernel Interface */ -#include "util_types.h" +#include "util/util_types.h" +#include "kernel/kernel_types.h" CCL_NAMESPACE_BEGIN @@ -28,6 +29,7 @@ CCL_NAMESPACE_BEGIN #define KERNEL_FUNCTION_FULL_NAME(name) KERNEL_NAME_EVAL(KERNEL_ARCH, name) struct KernelGlobals; +struct KernelData; KernelGlobals *kernel_globals_create(); void kernel_globals_free(KernelGlobals *kg); @@ -46,32 +48,22 @@ void kernel_tex_copy(KernelGlobals *kg, ExtensionType extension = EXTENSION_REPEAT); #define KERNEL_ARCH cpu -#include "kernels/cpu/kernel_cpu.h" - -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 -# define KERNEL_ARCH cpu_sse2 -# include "kernels/cpu/kernel_cpu.h" -#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 */ - -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 -# define KERNEL_ARCH cpu_sse3 -# include "kernels/cpu/kernel_cpu.h" -#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 */ - -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 -# define KERNEL_ARCH cpu_sse41 -# include "kernels/cpu/kernel_cpu.h" -#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 */ - -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX -# define KERNEL_ARCH cpu_avx -# include "kernels/cpu/kernel_cpu.h" -#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX */ - -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 -# define KERNEL_ARCH cpu_avx2 -# include "kernels/cpu/kernel_cpu.h" -#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 */ +#include "kernel/kernels/cpu/kernel_cpu.h" + +#define KERNEL_ARCH cpu_sse2 +#include "kernel/kernels/cpu/kernel_cpu.h" + +#define KERNEL_ARCH cpu_sse3 +#include "kernel/kernels/cpu/kernel_cpu.h" + +#define KERNEL_ARCH cpu_sse41 +#include 
"kernel/kernels/cpu/kernel_cpu.h" + +#define KERNEL_ARCH cpu_avx +#include "kernel/kernels/cpu/kernel_cpu.h" + +#define KERNEL_ARCH cpu_avx2 +#include "kernel/kernels/cpu/kernel_cpu.h" CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernel_accumulate.h b/intern/cycles/kernel/kernel_accumulate.h index 6c3ee6b8098..366f25422fd 100644 --- a/intern/cycles/kernel/kernel_accumulate.h +++ b/intern/cycles/kernel/kernel_accumulate.h @@ -21,6 +21,9 @@ CCL_NAMESPACE_BEGIN * BSDF evaluation result, split per BSDF type. This is used to accumulate * render passes separately. */ +ccl_device float3 shader_bsdf_transparency(KernelGlobals *kg, + const ShaderData *sd); + ccl_device_inline void bsdf_eval_init(BsdfEval *eval, ClosureType type, float3 value, int use_light_pass) { #ifdef __PASSES__ @@ -52,10 +55,17 @@ ccl_device_inline void bsdf_eval_init(BsdfEval *eval, ClosureType type, float3 v { eval->diffuse = value; } +#ifdef __SHADOW_TRICKS__ + eval->sum_no_mis = make_float3(0.0f, 0.0f, 0.0f); +#endif } -ccl_device_inline void bsdf_eval_accum(BsdfEval *eval, ClosureType type, float3 value) +ccl_device_inline void bsdf_eval_accum(BsdfEval *eval, ClosureType type, float3 value, float mis_weight) { +#ifdef __SHADOW_TRICKS__ + eval->sum_no_mis += value; +#endif + value *= mis_weight; #ifdef __PASSES__ if(eval->use_light_pass) { if(CLOSURE_IS_BSDF_DIFFUSE(type)) @@ -96,7 +106,7 @@ ccl_device_inline bool bsdf_eval_is_zero(BsdfEval *eval) } } -ccl_device_inline void bsdf_eval_mul(BsdfEval *eval, float value) +ccl_device_inline void bsdf_eval_mis(BsdfEval *eval, float value) { #ifdef __PASSES__ if(eval->use_light_pass) { @@ -115,8 +125,19 @@ ccl_device_inline void bsdf_eval_mul(BsdfEval *eval, float value) } } +ccl_device_inline void bsdf_eval_mul(BsdfEval *eval, float value) +{ +#ifdef __SHADOW_TRICKS__ + eval->sum_no_mis *= value; +#endif + bsdf_eval_mis(eval, value); +} + ccl_device_inline void bsdf_eval_mul3(BsdfEval *eval, float3 value) { +#ifdef __SHADOW_TRICKS__ + 
eval->sum_no_mis *= value; +#endif #ifdef __PASSES__ if(eval->use_light_pass) { eval->diffuse *= value; @@ -134,7 +155,7 @@ ccl_device_inline void bsdf_eval_mul3(BsdfEval *eval, float3 value) #endif } -ccl_device_inline float3 bsdf_eval_sum(BsdfEval *eval) +ccl_device_inline float3 bsdf_eval_sum(const BsdfEval *eval) { #ifdef __PASSES__ if(eval->use_light_pass) { @@ -160,7 +181,6 @@ ccl_device_inline void path_radiance_init(PathRadiance *L, int use_light_pass) if(use_light_pass) { L->indirect = make_float3(0.0f, 0.0f, 0.0f); - L->direct_throughput = make_float3(0.0f, 0.0f, 0.0f); L->direct_emission = make_float3(0.0f, 0.0f, 0.0f); L->color_diffuse = make_float3(0.0f, 0.0f, 0.0f); @@ -181,45 +201,78 @@ ccl_device_inline void path_radiance_init(PathRadiance *L, int use_light_pass) L->indirect_subsurface = make_float3(0.0f, 0.0f, 0.0f); L->indirect_scatter = make_float3(0.0f, 0.0f, 0.0f); - L->path_diffuse = make_float3(0.0f, 0.0f, 0.0f); - L->path_glossy = make_float3(0.0f, 0.0f, 0.0f); - L->path_transmission = make_float3(0.0f, 0.0f, 0.0f); - L->path_subsurface = make_float3(0.0f, 0.0f, 0.0f); - L->path_scatter = make_float3(0.0f, 0.0f, 0.0f); - + L->transparent = 0.0f; L->emission = make_float3(0.0f, 0.0f, 0.0f); L->background = make_float3(0.0f, 0.0f, 0.0f); L->ao = make_float3(0.0f, 0.0f, 0.0f); L->shadow = make_float4(0.0f, 0.0f, 0.0f, 0.0f); L->mist = 0.0f; + + L->state.diffuse = make_float3(0.0f, 0.0f, 0.0f); + L->state.glossy = make_float3(0.0f, 0.0f, 0.0f); + L->state.transmission = make_float3(0.0f, 0.0f, 0.0f); + L->state.subsurface = make_float3(0.0f, 0.0f, 0.0f); + L->state.scatter = make_float3(0.0f, 0.0f, 0.0f); + L->state.direct = make_float3(0.0f, 0.0f, 0.0f); } else #endif { + L->transparent = 0.0f; L->emission = make_float3(0.0f, 0.0f, 0.0f); } + +#ifdef __SHADOW_TRICKS__ + L->path_total = make_float3(0.0f, 0.0f, 0.0f); + L->path_total_shaded = make_float3(0.0f, 0.0f, 0.0f); + L->shadow_background_color = make_float3(0.0f, 0.0f, 0.0f); + 
L->shadow_throughput = 0.0f; + L->shadow_transparency = 1.0f; + L->has_shadow_catcher = 0; +#endif + +#ifdef __DENOISING_FEATURES__ + L->denoising_normal = make_float3(0.0f, 0.0f, 0.0f); + L->denoising_albedo = make_float3(0.0f, 0.0f, 0.0f); + L->denoising_depth = 0.0f; +#endif + +#ifdef __KERNEL_DEBUG__ + L->debug_data.num_bvh_traversed_nodes = 0; + L->debug_data.num_bvh_traversed_instances = 0; + L->debug_data.num_bvh_intersections = 0; + L->debug_data.num_ray_bounces = 0; +#endif } -ccl_device_inline void path_radiance_bsdf_bounce(PathRadiance *L, ccl_addr_space float3 *throughput, - BsdfEval *bsdf_eval, float bsdf_pdf, int bounce, int bsdf_label) +ccl_device_inline void path_radiance_bsdf_bounce( + KernelGlobals *kg, + PathRadianceState *L_state, + ccl_addr_space float3 *throughput, + BsdfEval *bsdf_eval, + float bsdf_pdf, int bounce, int bsdf_label) { float inverse_pdf = 1.0f/bsdf_pdf; #ifdef __PASSES__ - if(L->use_light_pass) { + if(kernel_data.film.use_light_pass) { if(bounce == 0 && !(bsdf_label & LABEL_TRANSPARENT)) { /* first on directly visible surface */ float3 value = *throughput*inverse_pdf; - L->path_diffuse = bsdf_eval->diffuse*value; - L->path_glossy = bsdf_eval->glossy*value; - L->path_transmission = bsdf_eval->transmission*value; - L->path_subsurface = bsdf_eval->subsurface*value; - L->path_scatter = bsdf_eval->scatter*value; - - *throughput = L->path_diffuse + L->path_glossy + L->path_transmission + L->path_subsurface + L->path_scatter; + L_state->diffuse = bsdf_eval->diffuse*value; + L_state->glossy = bsdf_eval->glossy*value; + L_state->transmission = bsdf_eval->transmission*value; + L_state->subsurface = bsdf_eval->subsurface*value; + L_state->scatter = bsdf_eval->scatter*value; + + *throughput = L_state->diffuse + + L_state->glossy + + L_state->transmission + + L_state->subsurface + + L_state->scatter; - L->direct_throughput = *throughput; + L_state->direct = *throughput; } else { /* transparent bounce before first hit, or indirectly visible 
through BSDF */ @@ -234,13 +287,22 @@ ccl_device_inline void path_radiance_bsdf_bounce(PathRadiance *L, ccl_addr_space } } -ccl_device_inline void path_radiance_accum_emission(PathRadiance *L, float3 throughput, float3 value, int bounce) +ccl_device_inline void path_radiance_accum_emission(PathRadiance *L, + ccl_addr_space PathState *state, + float3 throughput, + float3 value) { +#ifdef __SHADOW_TRICKS__ + if(state->flag & PATH_RAY_SHADOW_CATCHER) { + return; + } +#endif + #ifdef __PASSES__ if(L->use_light_pass) { - if(bounce == 0) + if(state->bounce == 0) L->emission += throughput*value; - else if(bounce == 1) + else if(state->bounce == 1) L->direct_emission += throughput*value; else L->indirect += throughput*value; @@ -252,11 +314,28 @@ ccl_device_inline void path_radiance_accum_emission(PathRadiance *L, float3 thro } } -ccl_device_inline void path_radiance_accum_ao(PathRadiance *L, float3 throughput, float3 alpha, float3 bsdf, float3 ao, int bounce) +ccl_device_inline void path_radiance_accum_ao(PathRadiance *L, + ccl_addr_space PathState *state, + float3 throughput, + float3 alpha, + float3 bsdf, + float3 ao) { +#ifdef __SHADOW_TRICKS__ + if(state->flag & PATH_RAY_STORE_SHADOW_INFO) { + float3 light = throughput * bsdf; + L->path_total += light; + L->path_total_shaded += ao * light; + + if(state->flag & PATH_RAY_SHADOW_CATCHER) { + return; + } + } +#endif + #ifdef __PASSES__ if(L->use_light_pass) { - if(bounce == 0) { + if(state->bounce == 0) { /* directly visible lighting */ L->direct_diffuse += throughput*bsdf*ao; L->ao += alpha*throughput*ao; @@ -273,11 +352,47 @@ ccl_device_inline void path_radiance_accum_ao(PathRadiance *L, float3 throughput } } -ccl_device_inline void path_radiance_accum_light(PathRadiance *L, float3 throughput, BsdfEval *bsdf_eval, float3 shadow, float shadow_fac, int bounce, bool is_lamp) +ccl_device_inline void path_radiance_accum_total_ao( + PathRadiance *L, + ccl_addr_space PathState *state, + float3 throughput, + float3 bsdf) +{ 
+#ifdef __SHADOW_TRICKS__ + if(state->flag & PATH_RAY_STORE_SHADOW_INFO) { + L->path_total += throughput * bsdf; + } +#else + (void) L; + (void) state; + (void) throughput; + (void) bsdf; +#endif +} + +ccl_device_inline void path_radiance_accum_light(PathRadiance *L, + ccl_addr_space PathState *state, + float3 throughput, + BsdfEval *bsdf_eval, + float3 shadow, + float shadow_fac, + bool is_lamp) { +#ifdef __SHADOW_TRICKS__ + if(state->flag & PATH_RAY_STORE_SHADOW_INFO) { + float3 light = throughput * bsdf_eval->sum_no_mis; + L->path_total += light; + L->path_total_shaded += shadow * light; + + if(state->flag & PATH_RAY_SHADOW_CATCHER) { + return; + } + } +#endif + #ifdef __PASSES__ if(L->use_light_pass) { - if(bounce == 0) { + if(state->bounce == 0) { /* directly visible lighting */ L->direct_diffuse += throughput*bsdf_eval->diffuse*shadow; L->direct_glossy += throughput*bsdf_eval->glossy*shadow; @@ -303,13 +418,47 @@ ccl_device_inline void path_radiance_accum_light(PathRadiance *L, float3 through } } -ccl_device_inline void path_radiance_accum_background(PathRadiance *L, float3 throughput, float3 value, int bounce) +ccl_device_inline void path_radiance_accum_total_light( + PathRadiance *L, + ccl_addr_space PathState *state, + float3 throughput, + const BsdfEval *bsdf_eval) { +#ifdef __SHADOW_TRICKS__ + if(state->flag & PATH_RAY_STORE_SHADOW_INFO) { + L->path_total += throughput * bsdf_eval->sum_no_mis; + } +#else + (void) L; + (void) state; + (void) throughput; + (void) bsdf_eval; +#endif +} + +ccl_device_inline void path_radiance_accum_background( + PathRadiance *L, + ccl_addr_space PathState *state, + float3 throughput, + float3 value) +{ + +#ifdef __SHADOW_TRICKS__ + if(state->flag & PATH_RAY_STORE_SHADOW_INFO) { + L->path_total += throughput * value; + L->path_total_shaded += throughput * value * L->shadow_transparency; + + if(state->flag & PATH_RAY_SHADOW_CATCHER) { + return; + } + } +#endif + #ifdef __PASSES__ if(L->use_light_pass) { - if(bounce == 0) + 
if(state->bounce == 0) L->background += throughput*value; - else if(bounce == 1) + else if(state->bounce == 1) L->direct_emission += throughput*value; else L->indirect += throughput*value; @@ -319,7 +468,31 @@ ccl_device_inline void path_radiance_accum_background(PathRadiance *L, float3 th { L->emission += throughput*value; } + +#ifdef __DENOISING_FEATURES__ + L->denoising_albedo += state->denoising_feature_weight * value; +#endif /* __DENOISING_FEATURES__ */ +} + +ccl_device_inline void path_radiance_accum_transparent( + PathRadiance *L, + ccl_addr_space PathState *state, + float3 throughput) +{ + L->transparent += average(throughput); +} + +#ifdef __SHADOW_TRICKS__ +ccl_device_inline void path_radiance_accum_shadowcatcher( + PathRadiance *L, + float3 throughput, + float3 background) +{ + L->shadow_throughput += average(throughput); + L->shadow_background_color += throughput * background; + L->has_shadow_catcher = 1; } +#endif ccl_device_inline void path_radiance_sum_indirect(PathRadiance *L) { @@ -328,19 +501,19 @@ ccl_device_inline void path_radiance_sum_indirect(PathRadiance *L) * only a single throughput further along the path, here we recover just * the indirect path that is not influenced by any particular BSDF type */ if(L->use_light_pass) { - L->direct_emission = safe_divide_color(L->direct_emission, L->direct_throughput); - L->direct_diffuse += L->path_diffuse*L->direct_emission; - L->direct_glossy += L->path_glossy*L->direct_emission; - L->direct_transmission += L->path_transmission*L->direct_emission; - L->direct_subsurface += L->path_subsurface*L->direct_emission; - L->direct_scatter += L->path_scatter*L->direct_emission; - - L->indirect = safe_divide_color(L->indirect, L->direct_throughput); - L->indirect_diffuse += L->path_diffuse*L->indirect; - L->indirect_glossy += L->path_glossy*L->indirect; - L->indirect_transmission += L->path_transmission*L->indirect; - L->indirect_subsurface += L->path_subsurface*L->indirect; - L->indirect_scatter += 
L->path_scatter*L->indirect; + L->direct_emission = safe_divide_color(L->direct_emission, L->state.direct); + L->direct_diffuse += L->state.diffuse*L->direct_emission; + L->direct_glossy += L->state.glossy*L->direct_emission; + L->direct_transmission += L->state.transmission*L->direct_emission; + L->direct_subsurface += L->state.subsurface*L->direct_emission; + L->direct_scatter += L->state.scatter*L->direct_emission; + + L->indirect = safe_divide_color(L->indirect, L->state.direct); + L->indirect_diffuse += L->state.diffuse*L->indirect; + L->indirect_glossy += L->state.glossy*L->indirect; + L->indirect_transmission += L->state.transmission*L->indirect; + L->indirect_subsurface += L->state.subsurface*L->indirect; + L->indirect_scatter += L->state.scatter*L->indirect; } #endif } @@ -349,11 +522,11 @@ ccl_device_inline void path_radiance_reset_indirect(PathRadiance *L) { #ifdef __PASSES__ if(L->use_light_pass) { - L->path_diffuse = make_float3(0.0f, 0.0f, 0.0f); - L->path_glossy = make_float3(0.0f, 0.0f, 0.0f); - L->path_transmission = make_float3(0.0f, 0.0f, 0.0f); - L->path_subsurface = make_float3(0.0f, 0.0f, 0.0f); - L->path_scatter = make_float3(0.0f, 0.0f, 0.0f); + L->state.diffuse = make_float3(0.0f, 0.0f, 0.0f); + L->state.glossy = make_float3(0.0f, 0.0f, 0.0f); + L->state.transmission = make_float3(0.0f, 0.0f, 0.0f); + L->state.subsurface = make_float3(0.0f, 0.0f, 0.0f); + L->state.scatter = make_float3(0.0f, 0.0f, 0.0f); L->direct_emission = make_float3(0.0f, 0.0f, 0.0f); L->indirect = make_float3(0.0f, 0.0f, 0.0f); @@ -366,11 +539,7 @@ ccl_device_inline void path_radiance_copy_indirect(PathRadiance *L, { #ifdef __PASSES__ if(L->use_light_pass) { - L->path_diffuse = L_src->path_diffuse; - L->path_glossy = L_src->path_glossy; - L->path_transmission = L_src->path_transmission; - L->path_subsurface = L_src->path_subsurface; - L->path_scatter = L_src->path_scatter; + L->state = L_src->state; L->direct_emission = L_src->direct_emission; L->indirect = 
L_src->indirect; @@ -378,7 +547,40 @@ ccl_device_inline void path_radiance_copy_indirect(PathRadiance *L, #endif } -ccl_device_inline float3 path_radiance_clamp_and_sum(KernelGlobals *kg, PathRadiance *L) +#ifdef __SHADOW_TRICKS__ +ccl_device_inline void path_radiance_sum_shadowcatcher(KernelGlobals *kg, + PathRadiance *L, + float3 *L_sum, + float *alpha) +{ + /* Calculate current shadow of the path. */ + float path_total = average(L->path_total); + float shadow; + + if(UNLIKELY(!isfinite_safe(path_total))) { + kernel_assert(!"Non-finite total radiance along the path"); + shadow = 0.0f; + } + else if(path_total == 0.0f) { + shadow = L->shadow_transparency; + } + else { + float path_total_shaded = average(L->path_total_shaded); + shadow = path_total_shaded / path_total; + } + + /* Calculate final light sum and transparency for shadow catcher object. */ + if(kernel_data.background.transparent) { + *alpha -= L->shadow_throughput * shadow; + } + else { + L->shadow_background_color *= shadow; + *L_sum += L->shadow_background_color; + } +} +#endif + +ccl_device_inline float3 path_radiance_clamp_and_sum(KernelGlobals *kg, PathRadiance *L, float *alpha) { float3 L_sum; /* Light Passes are used */ @@ -399,7 +601,7 @@ ccl_device_inline float3 path_radiance_clamp_and_sum(KernelGlobals *kg, PathRadi float sum = fabsf((L_sum).x) + fabsf((L_sum).y) + fabsf((L_sum).z); /* Reject invalid value */ - if(!isfinite(sum)) { + if(!isfinite_safe(sum)) { kernel_assert(!"Non-finite sum in path_radiance_clamp_and_sum!"); L_sum = make_float3(0.0f, 0.0f, 0.0f); @@ -455,8 +657,6 @@ ccl_device_inline float3 path_radiance_clamp_and_sum(KernelGlobals *kg, PathRadi L_sum = L_direct + L_indirect; } #endif - - return L_sum; } /* No Light Passes */ @@ -464,42 +664,105 @@ ccl_device_inline float3 path_radiance_clamp_and_sum(KernelGlobals *kg, PathRadi #endif { L_sum = L->emission; + + /* Reject invalid value */ + float sum = fabsf((L_sum).x) + fabsf((L_sum).y) + fabsf((L_sum).z); + 
if(!isfinite_safe(sum)) { + kernel_assert(!"Non-finite final sum in path_radiance_clamp_and_sum!"); + L_sum = make_float3(0.0f, 0.0f, 0.0f); + } } - /* Reject invalid value */ - float sum = fabsf((L_sum).x) + fabsf((L_sum).y) + fabsf((L_sum).z); - if(!isfinite(sum)) { - kernel_assert(!"Non-finite final sum in path_radiance_clamp_and_sum!"); - L_sum = make_float3(0.0f, 0.0f, 0.0f); + /* Compute alpha. */ + *alpha = 1.0f - L->transparent; + + /* Add shadow catcher contributions. */ +#ifdef __SHADOW_TRICKS__ + if(L->has_shadow_catcher) { + path_radiance_sum_shadowcatcher(kg, L, &L_sum, alpha); } +#endif /* __SHADOW_TRICKS__ */ return L_sum; } -ccl_device_inline void path_radiance_accum_sample(PathRadiance *L, PathRadiance *L_sample, int num_samples) +ccl_device_inline void path_radiance_split_denoising(KernelGlobals *kg, PathRadiance *L, float3 *noisy, float3 *clean) +{ +#ifdef __PASSES__ + kernel_assert(L->use_light_pass); + + *clean = L->emission + L->background; + *noisy = L->direct_scatter + L->indirect_scatter; + +# define ADD_COMPONENT(flag, component) \ + if(kernel_data.film.denoising_flags & flag) \ + *clean += component; \ + else \ + *noisy += component; + + ADD_COMPONENT(DENOISING_CLEAN_DIFFUSE_DIR, L->direct_diffuse); + ADD_COMPONENT(DENOISING_CLEAN_DIFFUSE_IND, L->indirect_diffuse); + ADD_COMPONENT(DENOISING_CLEAN_GLOSSY_DIR, L->direct_glossy); + ADD_COMPONENT(DENOISING_CLEAN_GLOSSY_IND, L->indirect_glossy); + ADD_COMPONENT(DENOISING_CLEAN_TRANSMISSION_DIR, L->direct_transmission); + ADD_COMPONENT(DENOISING_CLEAN_TRANSMISSION_IND, L->indirect_transmission); + ADD_COMPONENT(DENOISING_CLEAN_SUBSURFACE_DIR, L->direct_subsurface); + ADD_COMPONENT(DENOISING_CLEAN_SUBSURFACE_IND, L->indirect_subsurface); +# undef ADD_COMPONENT +#else + *noisy = L->emission; + *clean = make_float3(0.0f, 0.0f, 0.0f); +#endif + +#ifdef __SHADOW_TRICKS__ + if(L->has_shadow_catcher) { + *noisy += L->shadow_background_color; + } +#endif + + *noisy = ensure_finite3(*noisy); + *clean = 
ensure_finite3(*clean); +} + +ccl_device_inline void path_radiance_accum_sample(PathRadiance *L, PathRadiance *L_sample) { - float fac = 1.0f/num_samples; +#ifdef __SPLIT_KERNEL__ +# define safe_float3_add(f, v) \ + do { \ + ccl_global float *p = (ccl_global float*)(&(f)); \ + atomic_add_and_fetch_float(p+0, (v).x); \ + atomic_add_and_fetch_float(p+1, (v).y); \ + atomic_add_and_fetch_float(p+2, (v).z); \ + } while(0) +# define safe_float_add(f, v) \ + atomic_add_and_fetch_float(&(f), (v)) +#else +# define safe_float3_add(f, v) (f) += (v) +# define safe_float_add(f, v) (f) += (v) +#endif /* __SPLIT_KERNEL__ */ #ifdef __PASSES__ - L->direct_diffuse += L_sample->direct_diffuse*fac; - L->direct_glossy += L_sample->direct_glossy*fac; - L->direct_transmission += L_sample->direct_transmission*fac; - L->direct_subsurface += L_sample->direct_subsurface*fac; - L->direct_scatter += L_sample->direct_scatter*fac; - - L->indirect_diffuse += L_sample->indirect_diffuse*fac; - L->indirect_glossy += L_sample->indirect_glossy*fac; - L->indirect_transmission += L_sample->indirect_transmission*fac; - L->indirect_subsurface += L_sample->indirect_subsurface*fac; - L->indirect_scatter += L_sample->indirect_scatter*fac; - - L->background += L_sample->background*fac; - L->ao += L_sample->ao*fac; - L->shadow += L_sample->shadow*fac; - L->mist += L_sample->mist*fac; -#endif - L->emission += L_sample->emission * fac; + safe_float3_add(L->direct_diffuse, L_sample->direct_diffuse); + safe_float3_add(L->direct_glossy, L_sample->direct_glossy); + safe_float3_add(L->direct_transmission, L_sample->direct_transmission); + safe_float3_add(L->direct_subsurface, L_sample->direct_subsurface); + safe_float3_add(L->direct_scatter, L_sample->direct_scatter); + + safe_float3_add(L->indirect_diffuse, L_sample->indirect_diffuse); + safe_float3_add(L->indirect_glossy, L_sample->indirect_glossy); + safe_float3_add(L->indirect_transmission, L_sample->indirect_transmission); + 
safe_float3_add(L->indirect_subsurface, L_sample->indirect_subsurface); + safe_float3_add(L->indirect_scatter, L_sample->indirect_scatter); + + safe_float3_add(L->background, L_sample->background); + safe_float3_add(L->ao, L_sample->ao); + safe_float3_add(L->shadow, L_sample->shadow); + safe_float_add(L->mist, L_sample->mist); +#endif /* __PASSES__ */ + safe_float3_add(L->emission, L_sample->emission); + +#undef safe_float_add +#undef safe_float3_add } CCL_NAMESPACE_END - diff --git a/intern/cycles/kernel/kernel_bake.h b/intern/cycles/kernel/kernel_bake.h index c32ac6ccf41..84d8d84d486 100644 --- a/intern/cycles/kernel/kernel_bake.h +++ b/intern/cycles/kernel/kernel_bake.h @@ -21,7 +21,7 @@ CCL_NAMESPACE_BEGIN ccl_device_inline void compute_light_pass(KernelGlobals *kg, ShaderData *sd, PathRadiance *L, - RNG rng, + uint rng_hash, int pass_filter, int sample) { @@ -48,13 +48,13 @@ ccl_device_inline void compute_light_pass(KernelGlobals *kg, path_radiance_init(&L_sample, kernel_data.film.use_light_pass); /* init path state */ - path_state_init(kg, &emission_sd, &state, &rng, sample, NULL); + path_state_init(kg, &emission_sd, &state, rng_hash, sample, NULL); /* evaluate surface shader */ - float rbsdf = path_state_rng_1D(kg, &rng, &state, PRNG_BSDF); - shader_eval_surface(kg, sd, &rng, &state, rbsdf, state.flag, SHADER_CONTEXT_MAIN); + shader_eval_surface(kg, sd, &state, state.flag); - /* TODO, disable the closures we won't need */ + /* TODO, disable more closures we don't need besides transparent */ + shader_bsdf_disable_transparency(kg, sd); #ifdef __BRANCHED_PATH__ if(!kernel_data.integrator.branched) { @@ -63,13 +63,13 @@ ccl_device_inline void compute_light_pass(KernelGlobals *kg, /* sample ambient occlusion */ if(pass_filter & BAKE_FILTER_AO) { - kernel_path_ao(kg, sd, &emission_sd, &L_sample, &state, &rng, throughput, shader_bsdf_alpha(kg, sd)); + kernel_path_ao(kg, sd, &emission_sd, &L_sample, &state, throughput, shader_bsdf_alpha(kg, sd)); } /* sample 
emission */ if((pass_filter & BAKE_FILTER_EMISSION) && (sd->flag & SD_EMISSION)) { float3 emission = indirect_primitive_emission(kg, sd, 0.0f, state.flag, state.ray_pdf); - path_radiance_accum_emission(&L_sample, throughput, emission, state.bounce); + path_radiance_accum_emission(&L_sample, &state, throughput, emission); } bool is_sss_sample = false; @@ -85,7 +85,6 @@ ccl_device_inline void compute_light_pass(KernelGlobals *kg, &emission_sd, &L_sample, &state, - &rng, &ray, &throughput, &ss_indirect)) @@ -100,13 +99,10 @@ ccl_device_inline void compute_light_pass(KernelGlobals *kg, kernel_path_indirect(kg, &indirect_sd, &emission_sd, - &rng, &ray, throughput, - state.num_samples, &state, &L_sample); - kernel_path_subsurface_accum_indirect(&ss_indirect, &L_sample); } is_sss_sample = true; } @@ -115,14 +111,14 @@ ccl_device_inline void compute_light_pass(KernelGlobals *kg, /* sample light and BSDF */ if(!is_sss_sample && (pass_filter & (BAKE_FILTER_DIRECT | BAKE_FILTER_INDIRECT))) { - kernel_path_surface_connect_light(kg, &rng, sd, &emission_sd, throughput, &state, &L_sample); + kernel_path_surface_connect_light(kg, sd, &emission_sd, throughput, &state, &L_sample); - if(kernel_path_surface_bounce(kg, &rng, sd, &throughput, &state, &L_sample, &ray)) { + if(kernel_path_surface_bounce(kg, sd, &throughput, &state, &L_sample.state, &ray)) { #ifdef __LAMP_MIS__ state.ray_t = 0.0f; #endif /* compute indirect light */ - kernel_path_indirect(kg, &indirect_sd, &emission_sd, &rng, &ray, throughput, 1, &state, &L_sample); + kernel_path_indirect(kg, &indirect_sd, &emission_sd, &ray, throughput, &state, &L_sample); /* sum and reset indirect light pass variables for the next samples */ path_radiance_sum_indirect(&L_sample); @@ -136,13 +132,13 @@ ccl_device_inline void compute_light_pass(KernelGlobals *kg, /* sample ambient occlusion */ if(pass_filter & BAKE_FILTER_AO) { - kernel_branched_path_ao(kg, sd, &emission_sd, &L_sample, &state, &rng, throughput); + 
kernel_branched_path_ao(kg, sd, &emission_sd, &L_sample, &state, throughput); } /* sample emission */ if((pass_filter & BAKE_FILTER_EMISSION) && (sd->flag & SD_EMISSION)) { float3 emission = indirect_primitive_emission(kg, sd, 0.0f, state.flag, state.ray_pdf); - path_radiance_accum_emission(&L_sample, throughput, emission, state.bounce); + path_radiance_accum_emission(&L_sample, &state, throughput, emission); } #ifdef __SUBSURFACE__ @@ -150,7 +146,7 @@ ccl_device_inline void compute_light_pass(KernelGlobals *kg, if((pass_filter & BAKE_FILTER_SUBSURFACE) && (sd->flag & SD_BSSRDF)) { /* when mixing BSSRDF and BSDF closures we should skip BSDF lighting if scattering was successful */ kernel_branched_path_subsurface_scatter(kg, sd, &indirect_sd, - &emission_sd, &L_sample, &state, &rng, &ray, throughput); + &emission_sd, &L_sample, &state, &ray, throughput); } #endif @@ -160,20 +156,20 @@ ccl_device_inline void compute_light_pass(KernelGlobals *kg, /* direct light */ if(kernel_data.integrator.use_direct_light) { int all = kernel_data.integrator.sample_all_lights_direct; - kernel_branched_path_surface_connect_light(kg, &rng, + kernel_branched_path_surface_connect_light(kg, sd, &emission_sd, &state, throughput, 1.0f, &L_sample, all); } #endif /* indirect light */ - kernel_branched_path_surface_indirect_light(kg, &rng, + kernel_branched_path_surface_indirect_light(kg, sd, &indirect_sd, &emission_sd, throughput, 1.0f, &state, &L_sample); } } #endif /* accumulate into master L */ - path_radiance_accum_sample(L, &L_sample, 1); + path_radiance_accum_sample(L, &L_sample); } ccl_device bool is_aa_pass(ShaderEvalType type) @@ -224,7 +220,6 @@ ccl_device_inline float3 kernel_bake_shader_bsdf(KernelGlobals *kg, ccl_device float3 kernel_bake_evaluate_direct_indirect(KernelGlobals *kg, ShaderData *sd, - RNG *rng, PathState *state, float3 direct, float3 indirect, @@ -244,12 +239,12 @@ ccl_device float3 kernel_bake_evaluate_direct_indirect(KernelGlobals *kg, } else { /* surface color 
of the pass only */ - shader_eval_surface(kg, sd, rng, state, 0.0f, 0, SHADER_CONTEXT_MAIN); + shader_eval_surface(kg, sd, state, 0); return kernel_bake_shader_bsdf(kg, sd, type); } } else { - shader_eval_surface(kg, sd, rng, state, 0.0f, 0, SHADER_CONTEXT_MAIN); + shader_eval_surface(kg, sd, state, 0); color = kernel_bake_shader_bsdf(kg, sd, type); } @@ -291,14 +286,14 @@ ccl_device void kernel_bake_evaluate(KernelGlobals *kg, ccl_global uint4 *input, int num_samples = kernel_data.integrator.aa_samples; /* random number generator */ - RNG rng = cmj_hash(offset + i, kernel_data.integrator.seed); + uint rng_hash = cmj_hash(offset + i, kernel_data.integrator.seed); float filter_x, filter_y; if(sample == 0) { filter_x = filter_y = 0.5f; } else { - path_rng_2D(kg, &rng, sample, num_samples, PRNG_FILTER_U, &filter_x, &filter_y); + path_rng_2D(kg, rng_hash, sample, num_samples, PRNG_FILTER_U, &filter_x, &filter_y); } /* subpixel u/v offset */ @@ -320,7 +315,7 @@ ccl_device void kernel_bake_evaluate(KernelGlobals *kg, ccl_global uint4 *input, P, Ng, Ng, shader, object, prim, u, v, 1.0f, 0.5f, - !(kernel_tex_fetch(__object_flag, object) & SD_TRANSFORM_APPLIED), + !(kernel_tex_fetch(__object_flag, object) & SD_OBJECT_TRANSFORM_APPLIED), LAMP_NONE); sd.I = sd.N; @@ -334,18 +329,20 @@ ccl_device void kernel_bake_evaluate(KernelGlobals *kg, ccl_global uint4 *input, /* light passes if we need more than color */ if(pass_filter & ~BAKE_FILTER_COLOR) - compute_light_pass(kg, &sd, &L, rng, pass_filter, sample); + compute_light_pass(kg, &sd, &L, rng_hash, pass_filter, sample); switch(type) { /* data passes */ case SHADER_EVAL_NORMAL: { + float3 N = sd.N; if((sd.flag & SD_HAS_BUMP)) { - shader_eval_surface(kg, &sd, &rng, &state, 0.f, 0, SHADER_CONTEXT_MAIN); + shader_eval_surface(kg, &sd, &state, 0); + N = shader_bsdf_average_normal(kg, &sd); } - /* compression: normal = (2 * color) - 1 */ - out = sd.N * 0.5f + make_float3(0.5f, 0.5f, 0.5f); + /* encoding: normal = (2 * color) - 1 */ 
+ out = N * 0.5f + make_float3(0.5f, 0.5f, 0.5f); break; } case SHADER_EVAL_UV: @@ -355,7 +352,7 @@ ccl_device void kernel_bake_evaluate(KernelGlobals *kg, ccl_global uint4 *input, } case SHADER_EVAL_EMISSION: { - shader_eval_surface(kg, &sd, &rng, &state, 0.f, 0, SHADER_CONTEXT_EMISSION); + shader_eval_surface(kg, &sd, &state, 0); out = shader_emissive_eval(kg, &sd); break; } @@ -370,7 +367,8 @@ ccl_device void kernel_bake_evaluate(KernelGlobals *kg, ccl_global uint4 *input, case SHADER_EVAL_COMBINED: { if((pass_filter & BAKE_FILTER_COMBINED) == BAKE_FILTER_COMBINED) { - out = path_radiance_clamp_and_sum(kg, &L); + float alpha; + out = path_radiance_clamp_and_sum(kg, &L, &alpha); break; } @@ -408,7 +406,6 @@ ccl_device void kernel_bake_evaluate(KernelGlobals *kg, ccl_global uint4 *input, { out = kernel_bake_evaluate_direct_indirect(kg, &sd, - &rng, &state, L.direct_diffuse, L.indirect_diffuse, @@ -420,7 +417,6 @@ ccl_device void kernel_bake_evaluate(KernelGlobals *kg, ccl_global uint4 *input, { out = kernel_bake_evaluate_direct_indirect(kg, &sd, - &rng, &state, L.direct_glossy, L.indirect_glossy, @@ -432,7 +428,6 @@ ccl_device void kernel_bake_evaluate(KernelGlobals *kg, ccl_global uint4 *input, { out = kernel_bake_evaluate_direct_indirect(kg, &sd, - &rng, &state, L.direct_transmission, L.indirect_transmission, @@ -445,7 +440,6 @@ ccl_device void kernel_bake_evaluate(KernelGlobals *kg, ccl_global uint4 *input, #ifdef __SUBSURFACE__ out = kernel_bake_evaluate_direct_indirect(kg, &sd, - &rng, &state, L.direct_subsurface, L.indirect_subsurface, @@ -479,7 +473,7 @@ ccl_device void kernel_bake_evaluate(KernelGlobals *kg, ccl_global uint4 *input, /* evaluate */ int flag = 0; /* we can't know which type of BSDF this is for */ - out = shader_eval_background(kg, &sd, &state, flag, SHADER_CONTEXT_MAIN); + out = shader_eval_background(kg, &sd, &state, flag); break; } default: @@ -499,78 +493,69 @@ ccl_device void kernel_bake_evaluate(KernelGlobals *kg, ccl_global uint4 
*input, #endif /* __BAKING__ */ -ccl_device void kernel_shader_evaluate(KernelGlobals *kg, - ccl_global uint4 *input, - ccl_global float4 *output, - ccl_global float *output_luma, - ShaderEvalType type, - int i, - int sample) +ccl_device void kernel_displace_evaluate(KernelGlobals *kg, + ccl_global uint4 *input, + ccl_global float4 *output, + int i) { ShaderData sd; PathState state = {0}; uint4 in = input[i]; - float3 out; - if(type == SHADER_EVAL_DISPLACE) { - /* setup shader data */ - int object = in.x; - int prim = in.y; - float u = __uint_as_float(in.z); - float v = __uint_as_float(in.w); + /* setup shader data */ + int object = in.x; + int prim = in.y; + float u = __uint_as_float(in.z); + float v = __uint_as_float(in.w); - shader_setup_from_displace(kg, &sd, object, prim, u, v); + shader_setup_from_displace(kg, &sd, object, prim, u, v); - /* evaluate */ - float3 P = sd.P; - shader_eval_displacement(kg, &sd, &state, SHADER_CONTEXT_MAIN); - out = sd.P - P; + /* evaluate */ + float3 P = sd.P; + shader_eval_displacement(kg, &sd, &state); + float3 D = sd.P - P; - object_inverse_dir_transform(kg, &sd, &out); - } - else { // SHADER_EVAL_BACKGROUND - /* setup ray */ - Ray ray; - float u = __uint_as_float(in.x); - float v = __uint_as_float(in.y); - - ray.P = make_float3(0.0f, 0.0f, 0.0f); - ray.D = equirectangular_to_direction(u, v); - ray.t = 0.0f; + object_inverse_dir_transform(kg, &sd, &D); + + /* write output */ + output[i] += make_float4(D.x, D.y, D.z, 0.0f); +} + +ccl_device void kernel_background_evaluate(KernelGlobals *kg, + ccl_global uint4 *input, + ccl_global float4 *output, + int i) +{ + ShaderData sd; + PathState state = {0}; + uint4 in = input[i]; + + /* setup ray */ + Ray ray; + float u = __uint_as_float(in.x); + float v = __uint_as_float(in.y); + + ray.P = make_float3(0.0f, 0.0f, 0.0f); + ray.D = equirectangular_to_direction(u, v); + ray.t = 0.0f; #ifdef __CAMERA_MOTION__ - ray.time = 0.5f; + ray.time = 0.5f; #endif #ifdef __RAY_DIFFERENTIALS__ - ray.dD 
= differential3_zero(); - ray.dP = differential3_zero(); + ray.dD = differential3_zero(); + ray.dP = differential3_zero(); #endif - /* setup shader data */ - shader_setup_from_background(kg, &sd, &ray); + /* setup shader data */ + shader_setup_from_background(kg, &sd, &ray); + + /* evaluate */ + int flag = 0; /* we can't know which type of BSDF this is for */ + float3 color = shader_eval_background(kg, &sd, &state, flag); - /* evaluate */ - int flag = 0; /* we can't know which type of BSDF this is for */ - out = shader_eval_background(kg, &sd, &state, flag, SHADER_CONTEXT_MAIN); - } - /* write output */ - if(sample == 0) { - if(output != NULL) { - output[i] = make_float4(out.x, out.y, out.z, 0.0f); - } - if(output_luma != NULL) { - output_luma[i] = average(out); - } - } - else { - if(output != NULL) { - output[i] += make_float4(out.x, out.y, out.z, 0.0f); - } - if(output_luma != NULL) { - output_luma[i] += average(out); - } - } + output[i] += make_float4(color.x, color.y, color.z, 0.0f); } CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernel_camera.h b/intern/cycles/kernel/kernel_camera.h index dedac6b1465..0df5217d97a 100644 --- a/intern/cycles/kernel/kernel_camera.h +++ b/intern/cycles/kernel/kernel_camera.h @@ -457,7 +457,7 @@ ccl_device_inline float3 camera_world_to_ndc(KernelGlobals *kg, ShaderData *sd, { if(kernel_data.cam.type != CAMERA_PANORAMA) { /* perspective / ortho */ - if(ccl_fetch(sd, object) == PRIM_NONE && kernel_data.cam.type == CAMERA_PERSPECTIVE) + if(sd->object == PRIM_NONE && kernel_data.cam.type == CAMERA_PERSPECTIVE) P += camera_position(kg); Transform tfm = kernel_data.cam.worldtondc; @@ -467,7 +467,7 @@ ccl_device_inline float3 camera_world_to_ndc(KernelGlobals *kg, ShaderData *sd, /* panorama */ Transform tfm = kernel_data.cam.worldtocamera; - if(ccl_fetch(sd, object) != OBJECT_NONE) + if(sd->object != OBJECT_NONE) P = normalize(transform_point(&tfm, P)); else P = normalize(transform_direction(&tfm, P)); diff --git 
a/intern/cycles/kernel/kernel_compat_cpu.h b/intern/cycles/kernel/kernel_compat_cpu.h index dfcfcba2a40..4b43209e4aa 100644 --- a/intern/cycles/kernel/kernel_compat_cpu.h +++ b/intern/cycles/kernel/kernel_compat_cpu.h @@ -35,15 +35,24 @@ # define __NODES_FEATURES__ NODE_FEATURE_ALL #endif -#include "util_debug.h" -#include "util_math.h" -#include "util_simd.h" -#include "util_half.h" -#include "util_types.h" -#include "util_texture.h" +#include "util/util_debug.h" +#include "util/util_math.h" +#include "util/util_simd.h" +#include "util/util_half.h" +#include "util/util_types.h" +#include "util/util_texture.h" #define ccl_addr_space +#define ccl_local_id(d) 0 +#define ccl_global_id(d) (kg->global_id[d]) + +#define ccl_local_size(d) 1 +#define ccl_global_size(d) (kg->global_size[d]) + +#define ccl_group_id(d) ccl_global_id(d) +#define ccl_num_groups(d) ccl_global_size(d) + /* On x86_64, versions of glibc < 2.16 have an issue where expf is * much slower than the double version. This was fixed in glibc 2.16. */ @@ -65,7 +74,7 @@ CCL_NAMESPACE_BEGIN * pointer lookup. 
*/ template<typename T> struct texture { - ccl_always_inline T fetch(int index) + ccl_always_inline const T& fetch(int index) { kernel_assert(index >= 0 && index < width); return data[index]; @@ -78,9 +87,9 @@ template<typename T> struct texture { ccl_always_inline avxf fetch_avxf(const int index) { kernel_assert(index >= 0 && (index+1) < width); - ssef *ssefData = (ssef*)data; - ssef *ssefNodeData = &ssefData[index]; - return _mm256_loadu_ps((float *)ssefNodeData); + ssef *ssef_data = (ssef*)data; + ssef *ssef_node_data = &ssef_data[index]; + return _mm256_loadu_ps((float *)ssef_node_data); } #endif @@ -103,420 +112,6 @@ template<typename T> struct texture { int width; }; -template<typename T> struct texture_image { -#define SET_CUBIC_SPLINE_WEIGHTS(u, t) \ - { \ - u[0] = (((-1.0f/6.0f)* t + 0.5f) * t - 0.5f) * t + (1.0f/6.0f); \ - u[1] = (( 0.5f * t - 1.0f) * t ) * t + (2.0f/3.0f); \ - u[2] = (( -0.5f * t + 0.5f) * t + 0.5f) * t + (1.0f/6.0f); \ - u[3] = (1.0f / 6.0f) * t * t * t; \ - } (void)0 - - ccl_always_inline float4 read(float4 r) - { - return r; - } - - ccl_always_inline float4 read(uchar4 r) - { - float f = 1.0f/255.0f; - return make_float4(r.x*f, r.y*f, r.z*f, r.w*f); - } - - ccl_always_inline float4 read(uchar r) - { - float f = r*(1.0f/255.0f); - return make_float4(f, f, f, 1.0f); - } - - ccl_always_inline float4 read(float r) - { - /* TODO(dingto): Optimize this, so interpolation - * happens on float instead of float4 */ - return make_float4(r, r, r, 1.0f); - } - - ccl_always_inline float4 read(half4 r) - { - return half4_to_float4(r); - } - - ccl_always_inline float4 read(half r) - { - float f = half_to_float(r); - return make_float4(f, f, f, 1.0f); - } - - ccl_always_inline int wrap_periodic(int x, int width) - { - x %= width; - if(x < 0) - x += width; - return x; - } - - ccl_always_inline int wrap_clamp(int x, int width) - { - return clamp(x, 0, width-1); - } - - ccl_always_inline float frac(float x, int *ix) - { - int i = float_to_int(x) - ((x < 
0.0f)? 1: 0); - *ix = i; - return x - (float)i; - } - - ccl_always_inline float4 interp(float x, float y) - { - if(UNLIKELY(!data)) - return make_float4(0.0f, 0.0f, 0.0f, 0.0f); - - int ix, iy, nix, niy; - - if(interpolation == INTERPOLATION_CLOSEST) { - frac(x*(float)width, &ix); - frac(y*(float)height, &iy); - switch(extension) { - case EXTENSION_REPEAT: - ix = wrap_periodic(ix, width); - iy = wrap_periodic(iy, height); - break; - case EXTENSION_CLIP: - if(x < 0.0f || y < 0.0f || x > 1.0f || y > 1.0f) { - return make_float4(0.0f, 0.0f, 0.0f, 0.0f); - } - /* Fall through. */ - case EXTENSION_EXTEND: - ix = wrap_clamp(ix, width); - iy = wrap_clamp(iy, height); - break; - default: - kernel_assert(0); - return make_float4(0.0f, 0.0f, 0.0f, 0.0f); - } - return read(data[ix + iy*width]); - } - else if(interpolation == INTERPOLATION_LINEAR) { - float tx = frac(x*(float)width - 0.5f, &ix); - float ty = frac(y*(float)height - 0.5f, &iy); - - switch(extension) { - case EXTENSION_REPEAT: - ix = wrap_periodic(ix, width); - iy = wrap_periodic(iy, height); - - nix = wrap_periodic(ix+1, width); - niy = wrap_periodic(iy+1, height); - break; - case EXTENSION_CLIP: - if(x < 0.0f || y < 0.0f || x > 1.0f || y > 1.0f) { - return make_float4(0.0f, 0.0f, 0.0f, 0.0f); - } - /* Fall through. */ - case EXTENSION_EXTEND: - nix = wrap_clamp(ix+1, width); - niy = wrap_clamp(iy+1, height); - - ix = wrap_clamp(ix, width); - iy = wrap_clamp(iy, height); - break; - default: - kernel_assert(0); - return make_float4(0.0f, 0.0f, 0.0f, 0.0f); - } - - float4 r = (1.0f - ty)*(1.0f - tx)*read(data[ix + iy*width]); - r += (1.0f - ty)*tx*read(data[nix + iy*width]); - r += ty*(1.0f - tx)*read(data[ix + niy*width]); - r += ty*tx*read(data[nix + niy*width]); - - return r; - } - else { - /* Bicubic b-spline interpolation. 
*/ - float tx = frac(x*(float)width - 0.5f, &ix); - float ty = frac(y*(float)height - 0.5f, &iy); - int pix, piy, nnix, nniy; - switch(extension) { - case EXTENSION_REPEAT: - ix = wrap_periodic(ix, width); - iy = wrap_periodic(iy, height); - - pix = wrap_periodic(ix-1, width); - piy = wrap_periodic(iy-1, height); - - nix = wrap_periodic(ix+1, width); - niy = wrap_periodic(iy+1, height); - - nnix = wrap_periodic(ix+2, width); - nniy = wrap_periodic(iy+2, height); - break; - case EXTENSION_CLIP: - if(x < 0.0f || y < 0.0f || x > 1.0f || y > 1.0f) { - return make_float4(0.0f, 0.0f, 0.0f, 0.0f); - } - /* Fall through. */ - case EXTENSION_EXTEND: - pix = wrap_clamp(ix-1, width); - piy = wrap_clamp(iy-1, height); - - nix = wrap_clamp(ix+1, width); - niy = wrap_clamp(iy+1, height); - - nnix = wrap_clamp(ix+2, width); - nniy = wrap_clamp(iy+2, height); - - ix = wrap_clamp(ix, width); - iy = wrap_clamp(iy, height); - break; - default: - kernel_assert(0); - return make_float4(0.0f, 0.0f, 0.0f, 0.0f); - } - - const int xc[4] = {pix, ix, nix, nnix}; - const int yc[4] = {width * piy, - width * iy, - width * niy, - width * nniy}; - float u[4], v[4]; - /* Some helper macro to keep code reasonable size, - * let compiler to inline all the matrix multiplications. - */ -#define DATA(x, y) (read(data[xc[x] + yc[y]])) -#define TERM(col) \ - (v[col] * (u[0] * DATA(0, col) + \ - u[1] * DATA(1, col) + \ - u[2] * DATA(2, col) + \ - u[3] * DATA(3, col))) - - SET_CUBIC_SPLINE_WEIGHTS(u, tx); - SET_CUBIC_SPLINE_WEIGHTS(v, ty); - - /* Actual interpolation. 
*/ - return TERM(0) + TERM(1) + TERM(2) + TERM(3); - -#undef TERM -#undef DATA - } - } - - ccl_always_inline float4 interp_3d(float x, float y, float z) - { - return interp_3d_ex(x, y, z, interpolation); - } - - ccl_always_inline float4 interp_3d_ex(float x, float y, float z, - int interpolation = INTERPOLATION_LINEAR) - { - if(UNLIKELY(!data)) - return make_float4(0.0f, 0.0f, 0.0f, 0.0f); - - int ix, iy, iz, nix, niy, niz; - - if(interpolation == INTERPOLATION_CLOSEST) { - frac(x*(float)width, &ix); - frac(y*(float)height, &iy); - frac(z*(float)depth, &iz); - - switch(extension) { - case EXTENSION_REPEAT: - ix = wrap_periodic(ix, width); - iy = wrap_periodic(iy, height); - iz = wrap_periodic(iz, depth); - break; - case EXTENSION_CLIP: - if(x < 0.0f || y < 0.0f || z < 0.0f || - x > 1.0f || y > 1.0f || z > 1.0f) - { - return make_float4(0.0f, 0.0f, 0.0f, 0.0f); - } - /* Fall through. */ - case EXTENSION_EXTEND: - ix = wrap_clamp(ix, width); - iy = wrap_clamp(iy, height); - iz = wrap_clamp(iz, depth); - break; - default: - kernel_assert(0); - return make_float4(0.0f, 0.0f, 0.0f, 0.0f); - } - - return read(data[ix + iy*width + iz*width*height]); - } - else if(interpolation == INTERPOLATION_LINEAR) { - float tx = frac(x*(float)width - 0.5f, &ix); - float ty = frac(y*(float)height - 0.5f, &iy); - float tz = frac(z*(float)depth - 0.5f, &iz); - - switch(extension) { - case EXTENSION_REPEAT: - ix = wrap_periodic(ix, width); - iy = wrap_periodic(iy, height); - iz = wrap_periodic(iz, depth); - - nix = wrap_periodic(ix+1, width); - niy = wrap_periodic(iy+1, height); - niz = wrap_periodic(iz+1, depth); - break; - case EXTENSION_CLIP: - if(x < 0.0f || y < 0.0f || z < 0.0f || - x > 1.0f || y > 1.0f || z > 1.0f) - { - return make_float4(0.0f, 0.0f, 0.0f, 0.0f); - } - /* Fall through. 
*/ - case EXTENSION_EXTEND: - nix = wrap_clamp(ix+1, width); - niy = wrap_clamp(iy+1, height); - niz = wrap_clamp(iz+1, depth); - - ix = wrap_clamp(ix, width); - iy = wrap_clamp(iy, height); - iz = wrap_clamp(iz, depth); - break; - default: - kernel_assert(0); - return make_float4(0.0f, 0.0f, 0.0f, 0.0f); - } - - float4 r; - - r = (1.0f - tz)*(1.0f - ty)*(1.0f - tx)*read(data[ix + iy*width + iz*width*height]); - r += (1.0f - tz)*(1.0f - ty)*tx*read(data[nix + iy*width + iz*width*height]); - r += (1.0f - tz)*ty*(1.0f - tx)*read(data[ix + niy*width + iz*width*height]); - r += (1.0f - tz)*ty*tx*read(data[nix + niy*width + iz*width*height]); - - r += tz*(1.0f - ty)*(1.0f - tx)*read(data[ix + iy*width + niz*width*height]); - r += tz*(1.0f - ty)*tx*read(data[nix + iy*width + niz*width*height]); - r += tz*ty*(1.0f - tx)*read(data[ix + niy*width + niz*width*height]); - r += tz*ty*tx*read(data[nix + niy*width + niz*width*height]); - - return r; - } - else { - /* Tricubic b-spline interpolation. */ - const float tx = frac(x*(float)width - 0.5f, &ix); - const float ty = frac(y*(float)height - 0.5f, &iy); - const float tz = frac(z*(float)depth - 0.5f, &iz); - int pix, piy, piz, nnix, nniy, nniz; - - switch(extension) { - case EXTENSION_REPEAT: - ix = wrap_periodic(ix, width); - iy = wrap_periodic(iy, height); - iz = wrap_periodic(iz, depth); - - pix = wrap_periodic(ix-1, width); - piy = wrap_periodic(iy-1, height); - piz = wrap_periodic(iz-1, depth); - - nix = wrap_periodic(ix+1, width); - niy = wrap_periodic(iy+1, height); - niz = wrap_periodic(iz+1, depth); - - nnix = wrap_periodic(ix+2, width); - nniy = wrap_periodic(iy+2, height); - nniz = wrap_periodic(iz+2, depth); - break; - case EXTENSION_CLIP: - if(x < 0.0f || y < 0.0f || z < 0.0f || - x > 1.0f || y > 1.0f || z > 1.0f) - { - return make_float4(0.0f, 0.0f, 0.0f, 0.0f); - } - /* Fall through. 
*/ - case EXTENSION_EXTEND: - pix = wrap_clamp(ix-1, width); - piy = wrap_clamp(iy-1, height); - piz = wrap_clamp(iz-1, depth); - - nix = wrap_clamp(ix+1, width); - niy = wrap_clamp(iy+1, height); - niz = wrap_clamp(iz+1, depth); - - nnix = wrap_clamp(ix+2, width); - nniy = wrap_clamp(iy+2, height); - nniz = wrap_clamp(iz+2, depth); - - ix = wrap_clamp(ix, width); - iy = wrap_clamp(iy, height); - iz = wrap_clamp(iz, depth); - break; - default: - kernel_assert(0); - return make_float4(0.0f, 0.0f, 0.0f, 0.0f); - } - - const int xc[4] = {pix, ix, nix, nnix}; - const int yc[4] = {width * piy, - width * iy, - width * niy, - width * nniy}; - const int zc[4] = {width * height * piz, - width * height * iz, - width * height * niz, - width * height * nniz}; - float u[4], v[4], w[4]; - - /* Some helper macro to keep code reasonable size, - * let compiler to inline all the matrix multiplications. - */ -#define DATA(x, y, z) (read(data[xc[x] + yc[y] + zc[z]])) -#define COL_TERM(col, row) \ - (v[col] * (u[0] * DATA(0, col, row) + \ - u[1] * DATA(1, col, row) + \ - u[2] * DATA(2, col, row) + \ - u[3] * DATA(3, col, row))) -#define ROW_TERM(row) \ - (w[row] * (COL_TERM(0, row) + \ - COL_TERM(1, row) + \ - COL_TERM(2, row) + \ - COL_TERM(3, row))) - - SET_CUBIC_SPLINE_WEIGHTS(u, tx); - SET_CUBIC_SPLINE_WEIGHTS(v, ty); - SET_CUBIC_SPLINE_WEIGHTS(w, tz); - - /* Actual interpolation. 
*/ - return ROW_TERM(0) + ROW_TERM(1) + ROW_TERM(2) + ROW_TERM(3); - -#undef COL_TERM -#undef ROW_TERM -#undef DATA - } - } - - ccl_always_inline void dimensions_set(int width_, int height_, int depth_) - { - width = width_; - height = height_; - depth = depth_; - } - - T *data; - int interpolation; - ExtensionType extension; - int width, height, depth; -#undef SET_CUBIC_SPLINE_WEIGHTS -}; - -typedef texture<float4> texture_float4; -typedef texture<float2> texture_float2; -typedef texture<float> texture_float; -typedef texture<uint> texture_uint; -typedef texture<int> texture_int; -typedef texture<uint4> texture_uint4; -typedef texture<uchar4> texture_uchar4; -typedef texture<uchar> texture_uchar; -typedef texture_image<float> texture_image_float; -typedef texture_image<uchar> texture_image_uchar; -typedef texture_image<half> texture_image_half; -typedef texture_image<float4> texture_image_float4; -typedef texture_image<uchar4> texture_image_uchar4; -typedef texture_image<half4> texture_image_half4; - /* Macros to handle different memory storage on different devices */ #define kernel_tex_fetch(tex, index) (kg->tex.fetch(index)) @@ -524,9 +119,6 @@ typedef texture_image<half4> texture_image_half4; #define kernel_tex_fetch_ssef(tex, index) (kg->tex.fetch_ssef(index)) #define kernel_tex_fetch_ssei(tex, index) (kg->tex.fetch_ssei(index)) #define kernel_tex_lookup(tex, t, offset, size) (kg->tex.lookup(t, offset, size)) -#define kernel_tex_image_interp(tex,x,y) kernel_tex_image_interp_impl(kg,tex,x,y) -#define kernel_tex_image_interp_3d(tex, x, y, z) kernel_tex_image_interp_3d_impl(kg,tex,x,y,z) -#define kernel_tex_image_interp_3d_ex(tex, x, y, z, interpolation) kernel_tex_image_interp_3d_ex_impl(kg,tex, x, y, z, interpolation) #define kernel_tex_voxel_float(tex, x, y, z, sampling) (vdb_volume_sample_scalar(kg->vdb, kg->vdb_tdata, tex, x, y, z, sampling)) #define kernel_tex_voxel_float3(tex, x, y, z, sampling) (vdb_volume_sample_vector(kg->vdb, kg->vdb_tdata, tex, x, y, 
z, sampling)) diff --git a/intern/cycles/kernel/kernel_compat_cuda.h b/intern/cycles/kernel/kernel_compat_cuda.h index e0c7b17c6a0..fa512f80e41 100644 --- a/intern/cycles/kernel/kernel_compat_cuda.h +++ b/intern/cycles/kernel/kernel_compat_cuda.h @@ -33,71 +33,109 @@ #include <cuda.h> #include <cuda_fp16.h> #include <float.h> +#include <stdint.h> /* Qualifier wrappers for different names on different devices */ #define ccl_device __device__ __inline__ +#if __CUDA_ARCH__ < 300 +# define ccl_device_inline __device__ __inline__ # define ccl_device_forceinline __device__ __forceinline__ -#if (__KERNEL_CUDA_VERSION__ == 80) && (__CUDA_ARCH__ < 500) +#elif __CUDA_ARCH__ < 500 # define ccl_device_inline __device__ __forceinline__ +# define ccl_device_forceinline __device__ __forceinline__ #else # define ccl_device_inline __device__ __inline__ +# define ccl_device_forceinline __device__ __forceinline__ #endif #define ccl_device_noinline __device__ __noinline__ #define ccl_global -#define ccl_constant +#define ccl_static_constant __constant__ +#define ccl_constant const +#define ccl_local __shared__ +#define ccl_local_param +#define ccl_private #define ccl_may_alias #define ccl_addr_space #define ccl_restrict __restrict__ +/* TODO(sergey): In theory we might use references with CUDA, however + * performance impact yet to be investigated. 
+ */ +#define ccl_ref #define ccl_align(n) __align__(n) +#define ATTR_FALLTHROUGH + +#define CCL_MAX_LOCAL_SIZE (CUDA_THREADS_BLOCK_WIDTH*CUDA_THREADS_BLOCK_WIDTH) + + /* No assert supported for CUDA */ #define kernel_assert(cond) /* Types */ -#include "util_half.h" -#include "util_types.h" +#include "util/util_half.h" +#include "util/util_types.h" + +/* Work item functions */ + +ccl_device_inline uint ccl_local_id(uint d) +{ + switch(d) { + case 0: return threadIdx.x; + case 1: return threadIdx.y; + case 2: return threadIdx.z; + default: return 0; + } +} + +#define ccl_global_id(d) (ccl_group_id(d) * ccl_local_size(d) + ccl_local_id(d)) + +ccl_device_inline uint ccl_local_size(uint d) +{ + switch(d) { + case 0: return blockDim.x; + case 1: return blockDim.y; + case 2: return blockDim.z; + default: return 0; + } +} + +#define ccl_global_size(d) (ccl_num_groups(d) * ccl_local_size(d)) + +ccl_device_inline uint ccl_group_id(uint d) +{ + switch(d) { + case 0: return blockIdx.x; + case 1: return blockIdx.y; + case 2: return blockIdx.z; + default: return 0; + } +} + +ccl_device_inline uint ccl_num_groups(uint d) +{ + switch(d) { + case 0: return gridDim.x; + case 1: return gridDim.y; + case 2: return gridDim.z; + default: return 0; + } +} /* Textures */ -typedef texture<float4, 1> texture_float4; -typedef texture<float2, 1> texture_float2; -typedef texture<float, 1> texture_float; -typedef texture<uint, 1> texture_uint; -typedef texture<int, 1> texture_int; -typedef texture<uint4, 1> texture_uint4; -typedef texture<uchar, 1> texture_uchar; -typedef texture<uchar4, 1> texture_uchar4; +/* Use arrays for regular data. This is a little slower than textures on Fermi, + * but allows for cleaner code and we will stop supporting Fermi soon. */ +#define kernel_tex_fetch(t, index) t[(index)] + +/* On Kepler (6xx) and above, we use Bindless Textures for images. + * On Fermi cards (4xx and 5xx), we have to use regular textures. 
*/ +#if __CUDA_ARCH__ < 300 typedef texture<float4, 2> texture_image_float4; typedef texture<float4, 3> texture_image3d_float4; typedef texture<uchar4, 2, cudaReadModeNormalizedFloat> texture_image_uchar4; - -/* Macros to handle different memory storage on different devices */ - -/* On Fermi cards (4xx and 5xx), we use regular textures for both data and images. - * On Kepler (6xx) and above, we use Bindless Textures for images and arrays for data. - * - * Arrays are necessary in order to use the full VRAM on newer cards, and it's slightly faster. - * Using Arrays on Fermi turned out to be slower.*/ - -/* Fermi */ -#if __CUDA_ARCH__ < 300 -# define __KERNEL_CUDA_TEX_STORAGE__ -# define kernel_tex_fetch(t, index) tex1Dfetch(t, index) - -# define kernel_tex_image_interp(t, x, y) tex2D(t, x, y) -# define kernel_tex_image_interp_3d(t, x, y, z) tex3D(t, x, y, z) - -/* Kepler */ -#else -# define kernel_tex_fetch(t, index) t[(index)] - -# define kernel_tex_image_interp_float4(t, x, y) tex2D<float4>(t, x, y) -# define kernel_tex_image_interp_float(t, x, y) tex2D<float>(t, x, y) -# define kernel_tex_image_interp_3d_float4(t, x, y, z) tex3D<float4>(t, x, y, z) -# define kernel_tex_image_interp_3d_float(t, x, y, z) tex3D<float>(t, x, y, z) #endif #define kernel_data __data diff --git a/intern/cycles/kernel/kernel_compat_opencl.h b/intern/cycles/kernel/kernel_compat_opencl.h index f076e3a7d37..b02e3bc576d 100644 --- a/intern/cycles/kernel/kernel_compat_opencl.h +++ b/intern/cycles/kernel/kernel_compat_opencl.h @@ -36,11 +36,14 @@ #define ccl_device_forceinline ccl_device #define ccl_device_noinline ccl_device ccl_noinline #define ccl_may_alias +#define ccl_static_constant static __constant #define ccl_constant __constant #define ccl_global __global #define ccl_local __local +#define ccl_local_param __local #define ccl_private __private #define ccl_restrict restrict +#define ccl_ref #define ccl_align(n) __attribute__((aligned(n))) #ifdef __SPLIT_KERNEL__ @@ -49,6 +52,17 @@ # 
define ccl_addr_space #endif +#define ATTR_FALLTHROUGH + +#define ccl_local_id(d) get_local_id(d) +#define ccl_global_id(d) get_global_id(d) + +#define ccl_local_size(d) get_local_size(d) +#define ccl_global_size(d) get_global_size(d) + +#define ccl_group_id(d) get_group_id(d) +#define ccl_num_groups(d) get_num_groups(d) + /* Selective nodes compilation. */ #ifndef __NODES_MAX_GROUP__ # define __NODES_MAX_GROUP__ NODE_GROUP_LEVEL_MAX @@ -117,6 +131,7 @@ # define expf(x) native_exp(((float)(x))) # define sqrtf(x) native_sqrt(((float)(x))) # define logf(x) native_log(((float)(x))) +# define rcp(x) native_recip(x) #else # define sinf(x) sin(((float)(x))) # define cosf(x) cos(((float)(x))) @@ -124,17 +139,18 @@ # define expf(x) exp(((float)(x))) # define sqrtf(x) sqrt(((float)(x))) # define logf(x) log(((float)(x))) +# define rcp(x) recip(x) #endif /* data lookup defines */ #define kernel_data (*kg->data) -#define kernel_tex_fetch(t, index) kg->t[index] +#define kernel_tex_fetch(tex, index) ((const ccl_global tex##_t*)(kg->buffers[kg->tex.cl_buffer] + kg->tex.data))[(index)] /* define NULL */ #define NULL 0 -#include "util_half.h" -#include "util_types.h" +#include "util/util_half.h" +#include "util/util_types.h" #endif /* __KERNEL_COMPAT_OPENCL_H__ */ diff --git a/intern/cycles/kernel/kernel_debug.h b/intern/cycles/kernel/kernel_debug.h deleted file mode 100644 index 24d6458567e..00000000000 --- a/intern/cycles/kernel/kernel_debug.h +++ /dev/null @@ -1,50 +0,0 @@ -/* - * Copyright 2011-2014 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -CCL_NAMESPACE_BEGIN - -ccl_device_inline void debug_data_init(DebugData *debug_data) -{ - debug_data->num_bvh_traversal_steps = 0; - debug_data->num_bvh_traversed_instances = 0; - debug_data->num_ray_bounces = 0; -} - -ccl_device_inline void kernel_write_debug_passes(KernelGlobals *kg, - ccl_global float *buffer, - ccl_addr_space PathState *state, - DebugData *debug_data, - int sample) -{ - int flag = kernel_data.film.pass_flag; - if(flag & PASS_BVH_TRAVERSAL_STEPS) { - kernel_write_pass_float(buffer + kernel_data.film.pass_bvh_traversal_steps, - sample, - debug_data->num_bvh_traversal_steps); - } - if(flag & PASS_BVH_TRAVERSED_INSTANCES) { - kernel_write_pass_float(buffer + kernel_data.film.pass_bvh_traversed_instances, - sample, - debug_data->num_bvh_traversed_instances); - } - if(flag & PASS_RAY_BOUNCES) { - kernel_write_pass_float(buffer + kernel_data.film.pass_ray_bounces, - sample, - debug_data->num_ray_bounces); - } -} - -CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernel_emission.h b/intern/cycles/kernel/kernel_emission.h index 8c7c651a053..45b8c6311e1 100644 --- a/intern/cycles/kernel/kernel_emission.h +++ b/intern/cycles/kernel/kernel_emission.h @@ -37,16 +37,14 @@ ccl_device_noinline float3 direct_emissive_eval(KernelGlobals *kg, ray.D = ls->D; ray.P = ls->P; ray.t = 1.0f; -# ifdef __OBJECT_MOTION__ ray.time = time; -# endif ray.dP = differential3_zero(); ray.dD = dI; shader_setup_from_background(kg, emission_sd, &ray); path_state_modify_bounce(state, true); - eval = shader_eval_background(kg, emission_sd, state, 0, SHADER_CONTEXT_EMISSION); + eval = shader_eval_background(kg, emission_sd, state, 0); path_state_modify_bounce(state, false); } else @@ -67,16 +65,16 @@ ccl_device_noinline float3 direct_emissive_eval(KernelGlobals *kg, ls->shader, ls->object, ls->prim, ls->u, ls->v, t, time, false, ls->lamp); - ls->Ng = 
ccl_fetch(emission_sd, Ng); + ls->Ng = emission_sd->Ng; /* no path flag, we're evaluating this for all closures. that's weak but * we'd have to do multiple evaluations otherwise */ path_state_modify_bounce(state, true); - shader_eval_surface(kg, emission_sd, NULL, state, 0.0f, 0, SHADER_CONTEXT_EMISSION); + shader_eval_surface(kg, emission_sd, state, 0); path_state_modify_bounce(state, false); /* evaluate emissive closure */ - if(ccl_fetch(emission_sd, flag) & SD_EMISSION) + if(emission_sd->flag & SD_EMISSION) eval = shader_emissive_eval(kg, emission_sd); else eval = make_float3(0.0f, 0.0f, 0.0f); @@ -112,7 +110,7 @@ ccl_device_noinline bool direct_emission(KernelGlobals *kg, -ls->D, dD, ls->t, - ccl_fetch(sd, time)); + sd->time); if(is_zero(light_eval)) return false; @@ -120,7 +118,7 @@ ccl_device_noinline bool direct_emission(KernelGlobals *kg, /* evaluate BSDF at shading point */ #ifdef __VOLUME__ - if(ccl_fetch(sd, prim) != PRIM_NONE) + if(sd->prim != PRIM_NONE) shader_bsdf_eval(kg, sd, ls->D, eval, ls->pdf, ls->shader & SHADER_USE_MIS); else { float bsdf_pdf; @@ -156,8 +154,13 @@ ccl_device_noinline bool direct_emission(KernelGlobals *kg, if(bsdf_eval_is_zero(eval)) return false; - if(kernel_data.integrator.light_inv_rr_threshold > 0.0f) { - float probability = max3(bsdf_eval_sum(eval)) * kernel_data.integrator.light_inv_rr_threshold; + if(kernel_data.integrator.light_inv_rr_threshold > 0.0f +#ifdef __SHADOW_TRICKS__ + && (state->flag & PATH_RAY_SHADOW_CATCHER) == 0 +#endif + ) + { + float probability = max3(fabs(bsdf_eval_sum(eval))) * kernel_data.integrator.light_inv_rr_threshold; if(probability < 1.0f) { if(rand_terminate >= probability) { return false; @@ -168,8 +171,8 @@ ccl_device_noinline bool direct_emission(KernelGlobals *kg, if(ls->shader & SHADER_CAST_SHADOW) { /* setup ray */ - bool transmit = (dot(ccl_fetch(sd, Ng), ls->D) < 0.0f); - ray->P = ray_offset(ccl_fetch(sd, P), (transmit)? 
-ccl_fetch(sd, Ng): ccl_fetch(sd, Ng)); + bool transmit = (dot(sd->Ng, ls->D) < 0.0f); + ray->P = ray_offset(sd->P, (transmit)? -sd->Ng: sd->Ng); if(ls->t == FLT_MAX) { /* distant light */ @@ -182,7 +185,7 @@ ccl_device_noinline bool direct_emission(KernelGlobals *kg, ray->D = normalize_len(ray->D, &ray->t); } - ray->dP = ccl_fetch(sd, dP); + ray->dP = sd->dP; ray->dD = differential3_zero(); } else { @@ -204,14 +207,14 @@ ccl_device_noinline float3 indirect_primitive_emission(KernelGlobals *kg, Shader float3 L = shader_emissive_eval(kg, sd); #ifdef __HAIR__ - if(!(path_flag & PATH_RAY_MIS_SKIP) && (ccl_fetch(sd, flag) & SD_USE_MIS) && (ccl_fetch(sd, type) & PRIMITIVE_ALL_TRIANGLE)) + if(!(path_flag & PATH_RAY_MIS_SKIP) && (sd->flag & SD_USE_MIS) && (sd->type & PRIMITIVE_ALL_TRIANGLE)) #else - if(!(path_flag & PATH_RAY_MIS_SKIP) && (ccl_fetch(sd, flag) & SD_USE_MIS)) + if(!(path_flag & PATH_RAY_MIS_SKIP) && (sd->flag & SD_USE_MIS)) #endif { /* multiple importance sampling, get triangle light pdf, * and compute weight with respect to BSDF pdf */ - float pdf = triangle_light_pdf(kg, ccl_fetch(sd, Ng), ccl_fetch(sd, I), t); + float pdf = triangle_light_pdf(kg, sd, t); float mis_weight = power_heuristic(bsdf_pdf, pdf); return L*mis_weight; @@ -314,7 +317,7 @@ ccl_device_noinline float3 indirect_background(KernelGlobals *kg, # endif path_state_modify_bounce(state, true); - float3 L = shader_eval_background(kg, emission_sd, state, state->flag, SHADER_CONTEXT_EMISSION); + float3 L = shader_eval_background(kg, emission_sd, state, state->flag); path_state_modify_bounce(state, false); #ifdef __BACKGROUND_MIS__ diff --git a/intern/cycles/kernel/kernel_globals.h b/intern/cycles/kernel/kernel_globals.h index 74357bd96fc..7e2f67bbd63 100644 --- a/intern/cycles/kernel/kernel_globals.h +++ b/intern/cycles/kernel/kernel_globals.h @@ -16,6 +16,17 @@ /* Constant Globals */ +#ifndef __KERNEL_GLOBALS_H__ +#define __KERNEL_GLOBALS_H__ + +#ifdef __KERNEL_CPU__ +# include 
"util/util_vector.h" +#endif + +#ifdef __KERNEL_OPENCL__ +# include "util/util_atomic.h" +#endif + CCL_NAMESPACE_BEGIN /* On the CPU, we pass along the struct KernelGlobals to nearly everywhere in @@ -42,16 +53,9 @@ struct VolumeStep; # define MAX_VOLUME 1024 typedef struct KernelGlobals { - texture_image_uchar4 texture_byte4_images[TEX_NUM_BYTE4_CPU]; - texture_image_float4 texture_float4_images[TEX_NUM_FLOAT4_CPU]; - texture_image_half4 texture_half4_images[TEX_NUM_HALF4_CPU]; - texture_image_float texture_float_images[TEX_NUM_FLOAT_CPU]; - texture_image_uchar texture_byte_images[TEX_NUM_BYTE_CPU]; - texture_image_half texture_half_images[TEX_NUM_HALF_CPU]; - -# define KERNEL_TEX(type, ttype, name) ttype name; +# define KERNEL_TEX(type, name) texture<type> name; # define KERNEL_IMAGE_TEX(type, ttype, name) -# include "kernel_textures.h" +# include "kernel/kernel_textures.h" KernelData __data; @@ -72,7 +76,15 @@ typedef struct KernelGlobals { VolumeStep *decoupled_volume_steps[2]; int decoupled_volume_steps_index; + /* split kernel */ + SplitData split_data; + SplitParams split_param_data; + + int2 global_size; + int2 global_id; + # ifdef WITH_OPENVDB + /* OpenVDB */ OpenVDBGlobals *vdb; OpenVDBThreadData *vdb_tdata; # endif @@ -88,15 +100,14 @@ typedef struct KernelGlobals { #ifdef __KERNEL_CUDA__ __constant__ KernelData __data; -typedef struct KernelGlobals {} KernelGlobals; +typedef struct KernelGlobals { + /* NOTE: Keep the size in sync with SHADOW_STACK_MAX_HITS. 
*/ + Intersection hits_stack[64]; +} KernelGlobals; -# ifdef __KERNEL_CUDA_TEX_STORAGE__ -# define KERNEL_TEX(type, ttype, name) ttype name; -# else -# define KERNEL_TEX(type, ttype, name) const __constant__ __device__ type *name; -# endif +# define KERNEL_TEX(type, name) const __constant__ __device__ type *name; # define KERNEL_IMAGE_TEX(type, ttype, name) ttype name; -# include "kernel_textures.h" +# include "kernel/kernel_textures.h" #endif /* __KERNEL_CUDA__ */ @@ -104,19 +115,75 @@ typedef struct KernelGlobals {} KernelGlobals; #ifdef __KERNEL_OPENCL__ +# define KERNEL_TEX(type, name) \ +typedef type name##_t; +# include "kernel/kernel_textures.h" + typedef ccl_addr_space struct KernelGlobals { ccl_constant KernelData *data; + ccl_global char *buffers[8]; -# define KERNEL_TEX(type, ttype, name) \ - ccl_global type *name; -# include "kernel_textures.h" +# define KERNEL_TEX(type, name) \ + TextureInfo name; +# include "kernel/kernel_textures.h" # ifdef __SPLIT_KERNEL__ - ShaderData *sd_input; - Intersection *isect_shadow; + SplitData split_data; + SplitParams split_param_data; # endif } KernelGlobals; +#define KERNEL_BUFFER_PARAMS \ + ccl_global char *buffer0, \ + ccl_global char *buffer1, \ + ccl_global char *buffer2, \ + ccl_global char *buffer3, \ + ccl_global char *buffer4, \ + ccl_global char *buffer5, \ + ccl_global char *buffer6, \ + ccl_global char *buffer7 + +#define KERNEL_BUFFER_ARGS buffer0, buffer1, buffer2, buffer3, buffer4, buffer5, buffer6, buffer7 + +ccl_device_inline void kernel_set_buffer_pointers(KernelGlobals *kg, KERNEL_BUFFER_PARAMS) +{ +#ifdef __SPLIT_KERNEL__ + if(ccl_local_id(0) + ccl_local_id(1) == 0) +#endif + { + kg->buffers[0] = buffer0; + kg->buffers[1] = buffer1; + kg->buffers[2] = buffer2; + kg->buffers[3] = buffer3; + kg->buffers[4] = buffer4; + kg->buffers[5] = buffer5; + kg->buffers[6] = buffer6; + kg->buffers[7] = buffer7; + } + +# ifdef __SPLIT_KERNEL__ + ccl_barrier(CCL_LOCAL_MEM_FENCE); +# endif +} + +ccl_device_inline 
void kernel_set_buffer_info(KernelGlobals *kg) +{ +# ifdef __SPLIT_KERNEL__ + if(ccl_local_id(0) + ccl_local_id(1) == 0) +# endif + { + ccl_global TextureInfo *info = (ccl_global TextureInfo*)kg->buffers[0]; + +# define KERNEL_TEX(type, name) \ + kg->name = *(info++); +# include "kernel/kernel_textures.h" + } + +# ifdef __SPLIT_KERNEL__ + ccl_barrier(CCL_LOCAL_MEM_FENCE); +# endif +} + #endif /* __KERNEL_OPENCL__ */ /* Interpolated lookup table access */ @@ -155,3 +222,4 @@ ccl_device float lookup_table_read_2D(KernelGlobals *kg, float x, float y, int o CCL_NAMESPACE_END +#endif /* __KERNEL_GLOBALS_H__ */ diff --git a/intern/cycles/kernel/kernel_image_opencl.h b/intern/cycles/kernel/kernel_image_opencl.h deleted file mode 100644 index 0352c58037d..00000000000 --- a/intern/cycles/kernel/kernel_image_opencl.h +++ /dev/null @@ -1,231 +0,0 @@ -/* - * Copyright 2016 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/* For OpenCL all images are packed in a single array, and we do manual lookup - * and interpolation. 
*/ - -ccl_device_inline float4 svm_image_texture_read(KernelGlobals *kg, int id, int offset) -{ - /* Float4 */ - if(id < TEX_START_BYTE4_OPENCL) { - return kernel_tex_fetch(__tex_image_float4_packed, offset); - } - /* Byte4 */ - else if(id < TEX_START_FLOAT_OPENCL) { - uchar4 r = kernel_tex_fetch(__tex_image_byte4_packed, offset); - float f = 1.0f/255.0f; - return make_float4(r.x*f, r.y*f, r.z*f, r.w*f); - } - /* Float */ - else if(id < TEX_START_BYTE_OPENCL) { - float f = kernel_tex_fetch(__tex_image_float_packed, offset); - return make_float4(f, f, f, 1.0f); - } - /* Byte */ - else { - uchar r = kernel_tex_fetch(__tex_image_byte_packed, offset); - float f = r * (1.0f/255.0f); - return make_float4(f, f, f, 1.0f); - } -} - -ccl_device_inline int svm_image_texture_wrap_periodic(int x, int width) -{ - x %= width; - if(x < 0) - x += width; - return x; -} - -ccl_device_inline int svm_image_texture_wrap_clamp(int x, int width) -{ - return clamp(x, 0, width-1); -} - -ccl_device_inline float svm_image_texture_frac(float x, int *ix) -{ - int i = float_to_int(x) - ((x < 0.0f)? 1: 0); - *ix = i; - return x - (float)i; -} - -ccl_device float4 kernel_tex_image_interp(KernelGlobals *kg, int id, float x, float y) -{ - uint4 info = kernel_tex_fetch(__tex_image_packed_info, id*2); - uint width = info.x; - uint height = info.y; - uint offset = info.z; - - /* Image Options */ - uint interpolation = (info.w & (1 << 0)) ? 
INTERPOLATION_CLOSEST : INTERPOLATION_LINEAR; - uint extension; - if(info.w & (1 << 1)) - extension = EXTENSION_REPEAT; - else if(info.w & (1 << 2)) - extension = EXTENSION_EXTEND; - else - extension = EXTENSION_CLIP; - - float4 r; - int ix, iy, nix, niy; - if(interpolation == INTERPOLATION_CLOSEST) { - svm_image_texture_frac(x*width, &ix); - svm_image_texture_frac(y*height, &iy); - - if(extension == EXTENSION_REPEAT) { - ix = svm_image_texture_wrap_periodic(ix, width); - iy = svm_image_texture_wrap_periodic(iy, height); - } - else { - if(extension == EXTENSION_CLIP) { - if(x < 0.0f || y < 0.0f || x > 1.0f || y > 1.0f) { - return make_float4(0.0f, 0.0f, 0.0f, 0.0f); - } - } - /* Fall through. */ - /* EXTENSION_EXTEND */ - ix = svm_image_texture_wrap_clamp(ix, width); - iy = svm_image_texture_wrap_clamp(iy, height); - } - - r = svm_image_texture_read(kg, id, offset + ix + iy*width); - } - else { /* INTERPOLATION_LINEAR */ - float tx = svm_image_texture_frac(x*width - 0.5f, &ix); - float ty = svm_image_texture_frac(y*height - 0.5f, &iy); - - if(extension == EXTENSION_REPEAT) { - ix = svm_image_texture_wrap_periodic(ix, width); - iy = svm_image_texture_wrap_periodic(iy, height); - - nix = svm_image_texture_wrap_periodic(ix+1, width); - niy = svm_image_texture_wrap_periodic(iy+1, height); - } - else { - if(extension == EXTENSION_CLIP) { - if(x < 0.0f || y < 0.0f || x > 1.0f || y > 1.0f) { - return make_float4(0.0f, 0.0f, 0.0f, 0.0f); - } - } - nix = svm_image_texture_wrap_clamp(ix+1, width); - niy = svm_image_texture_wrap_clamp(iy+1, height); - ix = svm_image_texture_wrap_clamp(ix, width); - iy = svm_image_texture_wrap_clamp(iy, height); - } - - r = (1.0f - ty)*(1.0f - tx)*svm_image_texture_read(kg, id, offset + ix + iy*width); - r += (1.0f - ty)*tx*svm_image_texture_read(kg, id, offset + nix + iy*width); - r += ty*(1.0f - tx)*svm_image_texture_read(kg, id, offset + ix + niy*width); - r += ty*tx*svm_image_texture_read(kg, id, offset + nix + niy*width); - } - - return 
r; -} - - -ccl_device float4 kernel_tex_image_interp_3d(KernelGlobals *kg, int id, float x, float y, float z) -{ - uint4 info = kernel_tex_fetch(__tex_image_packed_info, id*2); - uint width = info.x; - uint height = info.y; - uint offset = info.z; - uint depth = kernel_tex_fetch(__tex_image_packed_info, id*2+1).x; - - /* Image Options */ - uint interpolation = (info.w & (1 << 0)) ? INTERPOLATION_CLOSEST : INTERPOLATION_LINEAR; - uint extension; - if(info.w & (1 << 1)) - extension = EXTENSION_REPEAT; - else if(info.w & (1 << 2)) - extension = EXTENSION_EXTEND; - else - extension = EXTENSION_CLIP; - - float4 r; - int ix, iy, iz, nix, niy, niz; - if(interpolation == INTERPOLATION_CLOSEST) { - svm_image_texture_frac(x*width, &ix); - svm_image_texture_frac(y*height, &iy); - svm_image_texture_frac(z*depth, &iz); - - if(extension == EXTENSION_REPEAT) { - ix = svm_image_texture_wrap_periodic(ix, width); - iy = svm_image_texture_wrap_periodic(iy, height); - iz = svm_image_texture_wrap_periodic(iz, depth); - } - else { - if(extension == EXTENSION_CLIP) { - if(x < 0.0f || y < 0.0f || z < 0.0f || - x > 1.0f || y > 1.0f || z > 1.0f) - { - return make_float4(0.0f, 0.0f, 0.0f, 0.0f); - } - } - /* Fall through. 
*/ - /* EXTENSION_EXTEND */ - ix = svm_image_texture_wrap_clamp(ix, width); - iy = svm_image_texture_wrap_clamp(iy, height); - iz = svm_image_texture_wrap_clamp(iz, depth); - } - r = svm_image_texture_read(kg, id, offset + ix + iy*width + iz*width*height); - } - else { /* INTERPOLATION_LINEAR */ - float tx = svm_image_texture_frac(x*(float)width - 0.5f, &ix); - float ty = svm_image_texture_frac(y*(float)height - 0.5f, &iy); - float tz = svm_image_texture_frac(z*(float)depth - 0.5f, &iz); - - if(extension == EXTENSION_REPEAT) { - ix = svm_image_texture_wrap_periodic(ix, width); - iy = svm_image_texture_wrap_periodic(iy, height); - iz = svm_image_texture_wrap_periodic(iz, depth); - - nix = svm_image_texture_wrap_periodic(ix+1, width); - niy = svm_image_texture_wrap_periodic(iy+1, height); - niz = svm_image_texture_wrap_periodic(iz+1, depth); - } - else { - if(extension == EXTENSION_CLIP) - if(x < 0.0f || y < 0.0f || z < 0.0f || - x > 1.0f || y > 1.0f || z > 1.0f) - { - return make_float4(0.0f, 0.0f, 0.0f, 0.0f); - } - /* Fall through. 
*/ - /* EXTENSION_EXTEND */ - nix = svm_image_texture_wrap_clamp(ix+1, width); - niy = svm_image_texture_wrap_clamp(iy+1, height); - niz = svm_image_texture_wrap_clamp(iz+1, depth); - - ix = svm_image_texture_wrap_clamp(ix, width); - iy = svm_image_texture_wrap_clamp(iy, height); - iz = svm_image_texture_wrap_clamp(iz, depth); - } - - r = (1.0f - tz)*(1.0f - ty)*(1.0f - tx)*svm_image_texture_read(kg, id, offset + ix + iy*width + iz*width*height); - r += (1.0f - tz)*(1.0f - ty)*tx*svm_image_texture_read(kg, id, offset + nix + iy*width + iz*width*height); - r += (1.0f - tz)*ty*(1.0f - tx)*svm_image_texture_read(kg, id, offset + ix + niy*width + iz*width*height); - r += (1.0f - tz)*ty*tx*svm_image_texture_read(kg, id, offset + nix + niy*width + iz*width*height); - - r += tz*(1.0f - ty)*(1.0f - tx)*svm_image_texture_read(kg, id, offset + ix + iy*width + niz*width*height); - r += tz*(1.0f - ty)*tx*svm_image_texture_read(kg, id, offset + nix + iy*width + niz*width*height); - r += tz*ty*(1.0f - tx)*svm_image_texture_read(kg, id, offset + ix + niy*width + niz*width*height); - r += tz*ty*tx*svm_image_texture_read(kg, id, offset + nix + niy*width + niz*width*height); - - } - - return r; -} diff --git a/intern/cycles/kernel/kernel_jitter.h b/intern/cycles/kernel/kernel_jitter.h index aec7bc33acd..f5855757d3f 100644 --- a/intern/cycles/kernel/kernel_jitter.h +++ b/intern/cycles/kernel/kernel_jitter.h @@ -149,6 +149,15 @@ ccl_device_inline uint cmj_hash(uint i, uint p) return i; } +ccl_device_inline uint cmj_hash_simple(uint i, uint p) +{ + i = (i ^ 61) ^ p; + i += i << 3; + i ^= i >> 4; + i *= 0x27d4eb2d; + return i; +} + ccl_device_inline float cmj_randfloat(uint i, uint p) { return cmj_hash(i, p) * (1.0f / 4294967808.0f); @@ -166,15 +175,26 @@ ccl_device float cmj_sample_1D(int s, int N, int p) return (x + jx)*invN; } -ccl_device void cmj_sample_2D(int s, int N, int p, float *fx, float *fy) +/* TODO(sergey): Do some extra tests and consider moving to util_math.h. 
*/ +ccl_device_inline int cmj_isqrt(int value) { - kernel_assert(s < N); - #if defined(__KERNEL_CUDA__) - int m = float_to_int(__fsqrt_ru(N)); + return float_to_int(__fsqrt_ru(value)); +#elif defined(__KERNEL_GPU__) + return float_to_int(sqrtf(value)); #else - int m = float_to_int(sqrtf(N)); + /* This is a workaround for fast-math on CPU which might replace sqrtf() + * with an approximated version. + */ + return float_to_int(sqrtf(value) + 1e-6f); #endif +} + +ccl_device void cmj_sample_2D(int s, int N, int p, float *fx, float *fy) +{ + kernel_assert(s < N); + + int m = cmj_isqrt(N); int n = (N - 1)/m + 1; float invN = 1.0f/N; float invm = 1.0f/m; diff --git a/intern/cycles/kernel/kernel_light.h b/intern/cycles/kernel/kernel_light.h index d4cc36d1495..c806deee8e7 100644 --- a/intern/cycles/kernel/kernel_light.h +++ b/intern/cycles/kernel/kernel_light.h @@ -102,7 +102,7 @@ ccl_device_inline float area_light_sample(float3 P, float cu = 1.0f / sqrtf(fu * fu + b0sq) * (fu > 0.0f ? 1.0f : -1.0f); cu = clamp(cu, -1.0f, 1.0f); /* Compute xu. */ - float xu = -(cu * z0) / sqrtf(1.0f - cu * cu); + float xu = -(cu * z0) / max(sqrtf(1.0f - cu * cu), 1e-7f); xu = clamp(xu, x0, x1); /* Compute yv. */ float z0sq = z0 * z0; @@ -396,11 +396,13 @@ ccl_device_inline float3 background_light_sample(KernelGlobals *kg, + (1.0f - portal_sampling_pdf) * cdf_pdf); } return D; - } else { + } + else { /* Sample map, but with nonzero portal_sampling_pdf for MIS. */ randu = (randu - portal_sampling_pdf) / (1.0f - portal_sampling_pdf); } - } else { + } + else { /* We can't sample a portal. * Check if we can sample the map instead. 
*/ @@ -763,78 +765,280 @@ ccl_device bool lamp_light_eval(KernelGlobals *kg, int lamp, float3 P, float3 D, /* Triangle Light */ -ccl_device void object_transform_light_sample(KernelGlobals *kg, LightSample *ls, int object, float time) +/* returns true if the triangle has motion blur or an instancing transform applied */ +ccl_device_inline bool triangle_world_space_vertices(KernelGlobals *kg, int object, int prim, float time, float3 V[3]) { + bool has_motion = false; + const int object_flag = kernel_tex_fetch(__object_flag, object); + + if(object_flag & SD_OBJECT_HAS_VERTEX_MOTION && time >= 0.0f) { + motion_triangle_vertices(kg, object, prim, time, V); + has_motion = true; + } + else { + triangle_vertices(kg, prim, V); + } + #ifdef __INSTANCING__ - /* instance transform */ - if(!(kernel_tex_fetch(__object_flag, object) & SD_TRANSFORM_APPLIED)) { + if(!(object_flag & SD_OBJECT_TRANSFORM_APPLIED)) { # ifdef __OBJECT_MOTION__ - Transform itfm; - Transform tfm = object_fetch_transform_motion_test(kg, object, time, &itfm); + Transform tfm = object_fetch_transform_motion_test(kg, object, time, NULL); # else Transform tfm = object_fetch_transform(kg, object, OBJECT_TRANSFORM); # endif - - ls->P = transform_point(&tfm, ls->P); - ls->Ng = normalize(transform_direction(&tfm, ls->Ng)); + V[0] = transform_point(&tfm, V[0]); + V[1] = transform_point(&tfm, V[1]); + V[2] = transform_point(&tfm, V[2]); + has_motion = true; } #endif + return has_motion; } -ccl_device void triangle_light_sample(KernelGlobals *kg, int prim, int object, - float randu, float randv, float time, LightSample *ls) +ccl_device_inline float triangle_light_pdf_area(KernelGlobals *kg, const float3 Ng, const float3 I, float t) { - float u, v; + float pdf = kernel_data.integrator.pdf_triangles; + float cos_pi = fabsf(dot(Ng, I)); - /* compute random point in triangle */ - randu = sqrtf(randu); + if(cos_pi == 0.0f) + return 0.0f; + + return t*t*pdf/cos_pi; +} - u = 1.0f - randu; - v = randv*randu; 
+ccl_device_forceinline float triangle_light_pdf(KernelGlobals *kg, ShaderData *sd, float t) +{ + /* A naive heuristic to decide between costly solid angle sampling + * and simple area sampling, comparing the distance to the triangle plane + * to the length of the edges of the triangle. */ + + float3 V[3]; + bool has_motion = triangle_world_space_vertices(kg, sd->object, sd->prim, sd->time, V); + + const float3 e0 = V[1] - V[0]; + const float3 e1 = V[2] - V[0]; + const float3 e2 = V[2] - V[1]; + const float longest_edge_squared = max(len_squared(e0), max(len_squared(e1), len_squared(e2))); + const float3 N = cross(e0, e1); + const float distance_to_plane = fabsf(dot(N, sd->I * t))/dot(N, N); + + if(longest_edge_squared > distance_to_plane*distance_to_plane) { + /* sd contains the point on the light source + * calculate Px, the point that we're shading */ + const float3 Px = sd->P + sd->I * t; + const float3 v0_p = V[0] - Px; + const float3 v1_p = V[1] - Px; + const float3 v2_p = V[2] - Px; + + const float3 u01 = safe_normalize(cross(v0_p, v1_p)); + const float3 u02 = safe_normalize(cross(v0_p, v2_p)); + const float3 u12 = safe_normalize(cross(v1_p, v2_p)); + + const float alpha = fast_acosf(dot(u02, u01)); + const float beta = fast_acosf(-dot(u01, u12)); + const float gamma = fast_acosf(dot(u02, u12)); + const float solid_angle = alpha + beta + gamma - M_PI_F; + + /* pdf_triangles is calculated over triangle area, but we're not sampling over its area */ + if(UNLIKELY(solid_angle == 0.0f)) { + return 0.0f; + } + else { + float area = 1.0f; + if(has_motion) { + /* get the center frame vertices, this is what the PDF was calculated from */ + triangle_world_space_vertices(kg, sd->object, sd->prim, -1.0f, V); + area = triangle_area(V[0], V[1], V[2]); + } + else { + area = 0.5f * len(N); + } + const float pdf = area * kernel_data.integrator.pdf_triangles; + return pdf / solid_angle; + } + } + else { + float pdf = triangle_light_pdf_area(kg, sd->Ng, sd->I, t); + 
if(has_motion) { + const float area = 0.5f * len(N); + if(UNLIKELY(area == 0.0f)) { + return 0.0f; + } + /* scale the PDF. + * area = the area the sample was taken from + * area_pre = the area from which pdf_triangles was calculated */ + triangle_world_space_vertices(kg, sd->object, sd->prim, -1.0f, V); + const float area_pre = triangle_area(V[0], V[1], V[2]); + pdf = pdf * area_pre / area; + } + return pdf; + } +} - /* triangle, so get position, normal, shader */ - triangle_point_normal(kg, object, prim, u, v, &ls->P, &ls->Ng, &ls->shader); +ccl_device_forceinline void triangle_light_sample(KernelGlobals *kg, int prim, int object, + float randu, float randv, float time, LightSample *ls, const float3 P) +{ + /* A naive heuristic to decide between costly solid angle sampling + * and simple area sampling, comparing the distance to the triangle plane + * to the length of the edges of the triangle. */ + + float3 V[3]; + bool has_motion = triangle_world_space_vertices(kg, object, prim, time, V); + + const float3 e0 = V[1] - V[0]; + const float3 e1 = V[2] - V[0]; + const float3 e2 = V[2] - V[1]; + const float longest_edge_squared = max(len_squared(e0), max(len_squared(e1), len_squared(e2))); + const float3 N0 = cross(e0, e1); + float Nl = 0.0f; + ls->Ng = safe_normalize_len(N0, &Nl); + float area = 0.5f * Nl; + + /* flip normal if necessary */ + const int object_flag = kernel_tex_fetch(__object_flag, object); + if(object_flag & SD_OBJECT_NEGATIVE_SCALE_APPLIED) { + ls->Ng = -ls->Ng; + } + ls->eval_fac = 1.0f; + ls->shader = kernel_tex_fetch(__tri_shader, prim); ls->object = object; ls->prim = prim; ls->lamp = LAMP_NONE; ls->shader |= SHADER_USE_MIS; - ls->t = 0.0f; - ls->u = u; - ls->v = v; ls->type = LIGHT_TRIANGLE; - ls->eval_fac = 1.0f; - object_transform_light_sample(kg, ls, object, time); -} + float distance_to_plane = fabsf(dot(N0, V[0] - P)/dot(N0, N0)); + + if(longest_edge_squared > distance_to_plane*distance_to_plane) { + /* see James Arvo, "Stratified 
Sampling of Spherical Triangles" + * http://www.graphics.cornell.edu/pubs/1995/Arv95c.pdf */ + + /* project the triangle to the unit sphere + * and calculate its edges and angles */ + const float3 v0_p = V[0] - P; + const float3 v1_p = V[1] - P; + const float3 v2_p = V[2] - P; + + const float3 u01 = safe_normalize(cross(v0_p, v1_p)); + const float3 u02 = safe_normalize(cross(v0_p, v2_p)); + const float3 u12 = safe_normalize(cross(v1_p, v2_p)); + + const float3 A = safe_normalize(v0_p); + const float3 B = safe_normalize(v1_p); + const float3 C = safe_normalize(v2_p); + + const float cos_alpha = dot(u02, u01); + const float cos_beta = -dot(u01, u12); + const float cos_gamma = dot(u02, u12); + + /* calculate dihedral angles */ + const float alpha = fast_acosf(cos_alpha); + const float beta = fast_acosf(cos_beta); + const float gamma = fast_acosf(cos_gamma); + /* the area of the unit spherical triangle = solid angle */ + const float solid_angle = alpha + beta + gamma - M_PI_F; + + /* precompute a few things + * these could be re-used to take several samples + * as they are independent of randu/randv */ + const float cos_c = dot(A, B); + const float sin_alpha = fast_sinf(alpha); + const float product = sin_alpha * cos_c; + + /* Select a random sub-area of the spherical triangle + * and calculate the third vertex C_ of that new triangle */ + const float phi = randu * solid_angle - alpha; + float s, t; + fast_sincosf(phi, &s, &t); + const float u = t - cos_alpha; + const float v = s + product; + + const float3 U = safe_normalize(C - dot(C, A) * A); + + float q = 1.0f; + const float det = ((v * s + u * t) * sin_alpha); + if(det != 0.0f) { + q = ((v * t - u * s) * cos_alpha - v) / det; + } + const float temp = max(1.0f - q*q, 0.0f); -ccl_device float triangle_light_pdf(KernelGlobals *kg, - const float3 Ng, const float3 I, float t) -{ - float pdf = kernel_data.integrator.pdf_triangles; - float cos_pi = fabsf(dot(Ng, I)); + const float3 C_ = safe_normalize(q * A + sqrtf(temp) 
* U); - if(cos_pi == 0.0f) - return 0.0f; - - return t*t*pdf/cos_pi; + /* Finally, select a random point along the edge of the new triangle + * That point on the spherical triangle is the sampled ray direction */ + const float z = 1.0f - randv * (1.0f - dot(C_, B)); + ls->D = z * B + safe_sqrtf(1.0f - z*z) * safe_normalize(C_ - dot(C_, B) * B); + + /* calculate intersection with the planar triangle */ + if(!ray_triangle_intersect(P, ls->D, FLT_MAX, +#if defined(__KERNEL_SSE2__) && defined(__KERNEL_SSE__) + (ssef*)V, +#else + V[0], V[1], V[2], +#endif + &ls->u, &ls->v, &ls->t)) { + ls->pdf = 0.0f; + return; + } + + ls->P = P + ls->D * ls->t; + + /* pdf_triangles is calculated over triangle area, but we're sampling over solid angle */ + if(UNLIKELY(solid_angle == 0.0f)) { + ls->pdf = 0.0f; + return; + } + else { + if(has_motion) { + /* get the center frame vertices, this is what the PDF was calculated from */ + triangle_world_space_vertices(kg, object, prim, -1.0f, V); + area = triangle_area(V[0], V[1], V[2]); + } + const float pdf = area * kernel_data.integrator.pdf_triangles; + ls->pdf = pdf / solid_angle; + } + } + else { + /* compute random point in triangle */ + randu = sqrtf(randu); + + const float u = 1.0f - randu; + const float v = randv*randu; + const float t = 1.0f - u - v; + ls->P = u * V[0] + v * V[1] + t * V[2]; + /* compute incoming direction, distance and pdf */ + ls->D = normalize_len(ls->P - P, &ls->t); + ls->pdf = triangle_light_pdf_area(kg, ls->Ng, -ls->D, ls->t); + if(has_motion && area != 0.0f) { + /* scale the PDF. 
+ * area = the area the sample was taken from + * area_pre = the area from which pdf_triangles was calculated */ + triangle_world_space_vertices(kg, object, prim, -1.0f, V); + const float area_pre = triangle_area(V[0], V[1], V[2]); + ls->pdf = ls->pdf * area_pre / area; + } + ls->u = u; + ls->v = v; + } } /* Light Distribution */ -ccl_device int light_distribution_sample(KernelGlobals *kg, float randt) +ccl_device int light_distribution_sample(KernelGlobals *kg, float *randu) { - /* this is basically std::upper_bound as used by pbrt, to find a point light or + /* This is basically std::upper_bound as used by pbrt, to find a point light or * triangle to emit from, proportional to area. a good improvement would be to * also sample proportional to power, though it's not so well defined with - * OSL shaders. */ + * arbitrary shaders. */ int first = 0; int len = kernel_data.integrator.num_distribution + 1; + float r = *randu; while(len > 0) { int half_len = len >> 1; int middle = first + half_len; - if(randt < kernel_tex_fetch(__light_distribution, middle).x) { + if(r < kernel_tex_fetch(__light_distribution, middle).x) { len = half_len; } else { @@ -843,9 +1047,17 @@ ccl_device int light_distribution_sample(KernelGlobals *kg, float randt) } } - /* clamping should not be needed but float rounding errors seem to - * make this fail on rare occasions */ - return clamp(first-1, 0, kernel_data.integrator.num_distribution-1); + /* Clamping should not be needed but float rounding errors seem to + * make this fail on rare occasions. */ + int index = clamp(first-1, 0, kernel_data.integrator.num_distribution-1); + + /* Rescale to reuse random number. this helps the 2D samples within + * each area light be stratified as well. 
*/ + float distr_min = kernel_tex_fetch(__light_distribution, index).x; + float distr_max = kernel_tex_fetch(__light_distribution, index+1).x; + *randu = (r - distr_min)/(distr_max - distr_min); + + return index; } /* Generic Light */ @@ -857,7 +1069,6 @@ ccl_device bool light_select_reached_max_bounces(KernelGlobals *kg, int index, i } ccl_device_noinline bool light_sample(KernelGlobals *kg, - float randt, float randu, float randv, float time, @@ -866,7 +1077,7 @@ ccl_device_noinline bool light_sample(KernelGlobals *kg, LightSample *ls) { /* sample index */ - int index = light_distribution_sample(kg, randt); + int index = light_distribution_sample(kg, &randu); /* fetch light data */ float4 l = kernel_tex_fetch(__light_distribution, index); @@ -876,10 +1087,7 @@ ccl_device_noinline bool light_sample(KernelGlobals *kg, int object = __float_as_int(l.w); int shader_flag = __float_as_int(l.z); - triangle_light_sample(kg, prim, object, randu, randv, time, ls); - /* compute incoming direction, distance and pdf */ - ls->D = normalize_len(ls->P - P, &ls->t); - ls->pdf = triangle_light_pdf(kg, ls->Ng, -ls->D, ls->t); + triangle_light_sample(kg, prim, object, randu, randv, time, ls, P); ls->shader |= shader_flag; return (ls->pdf > 0.0f); } diff --git a/intern/cycles/kernel/kernel_math.h b/intern/cycles/kernel/kernel_math.h index 9bee5603474..bd0e23b7705 100644 --- a/intern/cycles/kernel/kernel_math.h +++ b/intern/cycles/kernel/kernel_math.h @@ -17,11 +17,11 @@ #ifndef __KERNEL_MATH_H__ #define __KERNEL_MATH_H__ -#include "util_color.h" -#include "util_math.h" -#include "util_math_fast.h" -#include "util_texture.h" -#include "util_transform.h" +#include "util/util_color.h" +#include "util/util_math.h" +#include "util/util_math_fast.h" +#include "util/util_math_intersect.h" +#include "util/util_texture.h" +#include "util/util_transform.h" #endif /* __KERNEL_MATH_H__ */ - diff --git a/intern/cycles/kernel/kernel_montecarlo.h b/intern/cycles/kernel/kernel_montecarlo.h index 
af7b727c1ba..9995490505f 100644 --- a/intern/cycles/kernel/kernel_montecarlo.h +++ b/intern/cycles/kernel/kernel_montecarlo.h @@ -67,8 +67,8 @@ ccl_device_inline void sample_cos_hemisphere(const float3 N, /* sample direction uniformly distributed in hemisphere */ ccl_device_inline void sample_uniform_hemisphere(const float3 N, - float randu, float randv, - float3 *omega_in, float *pdf) + float randu, float randv, + float3 *omega_in, float *pdf) { float z = randu; float r = sqrtf(max(0.0f, 1.0f - z*z)); @@ -84,8 +84,8 @@ ccl_device_inline void sample_uniform_hemisphere(const float3 N, /* sample direction uniformly distributed in cone */ ccl_device_inline void sample_uniform_cone(const float3 N, float angle, - float randu, float randv, - float3 *omega_in, float *pdf) + float randu, float randv, + float3 *omega_in, float *pdf) { float z = cosf(angle*randu); float r = sqrtf(max(0.0f, 1.0f - z*z)); @@ -187,4 +187,3 @@ ccl_device float2 regular_polygon_sample(float corners, float rotation, float u, CCL_NAMESPACE_END #endif /* __KERNEL_MONTECARLO_CL__ */ - diff --git a/intern/cycles/kernel/kernel_passes.h b/intern/cycles/kernel/kernel_passes.h index 7aec47e4957..b31356905f2 100644 --- a/intern/cycles/kernel/kernel_passes.h +++ b/intern/cycles/kernel/kernel_passes.h @@ -16,19 +16,23 @@ CCL_NAMESPACE_BEGIN -ccl_device_inline void kernel_write_pass_float(ccl_global float *buffer, int sample, float value) +#if defined(__SPLIT_KERNEL__) || defined(__KERNEL_CUDA__) +#define __ATOMIC_PASS_WRITE__ +#endif + +ccl_device_inline void kernel_write_pass_float(ccl_global float *buffer, float value) { ccl_global float *buf = buffer; -#if defined(__SPLIT_KERNEL__) && defined(__WORK_STEALING__) +#ifdef __ATOMIC_PASS_WRITE__ atomic_add_and_fetch_float(buf, value); #else - *buf = (sample == 0)? 
value: *buf + value; -#endif // __SPLIT_KERNEL__ && __WORK_STEALING__ + *buf += value; +#endif } -ccl_device_inline void kernel_write_pass_float3(ccl_global float *buffer, int sample, float3 value) +ccl_device_inline void kernel_write_pass_float3(ccl_global float *buffer, float3 value) { -#if defined(__SPLIT_KERNEL__) && defined(__WORK_STEALING__) +#ifdef __ATOMIC_PASS_WRITE__ ccl_global float *buf_x = buffer + 0; ccl_global float *buf_y = buffer + 1; ccl_global float *buf_z = buffer + 2; @@ -38,13 +42,13 @@ ccl_device_inline void kernel_write_pass_float3(ccl_global float *buffer, int sa atomic_add_and_fetch_float(buf_z, value.z); #else ccl_global float3 *buf = (ccl_global float3*)buffer; - *buf = (sample == 0)? value: *buf + value; -#endif // __SPLIT_KERNEL__ && __WORK_STEALING__ + *buf += value; +#endif } -ccl_device_inline void kernel_write_pass_float4(ccl_global float *buffer, int sample, float4 value) +ccl_device_inline void kernel_write_pass_float4(ccl_global float *buffer, float4 value) { -#if defined(__SPLIT_KERNEL__) && defined(__WORK_STEALING__) +#ifdef __ATOMIC_PASS_WRITE__ ccl_global float *buf_x = buffer + 0; ccl_global float *buf_y = buffer + 1; ccl_global float *buf_z = buffer + 2; @@ -56,12 +60,137 @@ ccl_device_inline void kernel_write_pass_float4(ccl_global float *buffer, int sa atomic_add_and_fetch_float(buf_w, value.w); #else ccl_global float4 *buf = (ccl_global float4*)buffer; - *buf = (sample == 0)? value: *buf + value; -#endif // __SPLIT_KERNEL__ && __WORK_STEALING__ + *buf += value; +#endif +} + +#ifdef __DENOISING_FEATURES__ +ccl_device_inline void kernel_write_pass_float_variance(ccl_global float *buffer, float value) +{ + kernel_write_pass_float(buffer, value); + + /* The online one-pass variance update that's used for the megakernel can't easily be implemented + * with atomics, so for the split kernel the E[x^2] - 1/N * (E[x])^2 fallback is used. 
*/ + kernel_write_pass_float(buffer+1, value*value); } +# ifdef __ATOMIC_PASS_WRITE__ +# define kernel_write_pass_float3_unaligned kernel_write_pass_float3 +# else +ccl_device_inline void kernel_write_pass_float3_unaligned(ccl_global float *buffer, float3 value) +{ + buffer[0] += value.x; + buffer[1] += value.y; + buffer[2] += value.z; +} +# endif + +ccl_device_inline void kernel_write_pass_float3_variance(ccl_global float *buffer, float3 value) +{ + kernel_write_pass_float3_unaligned(buffer, value); + kernel_write_pass_float3_unaligned(buffer+3, value*value); +} + +ccl_device_inline void kernel_write_denoising_shadow(KernelGlobals *kg, ccl_global float *buffer, + int sample, float path_total, float path_total_shaded) +{ + if(kernel_data.film.pass_denoising_data == 0) + return; + + buffer += (sample & 1)? DENOISING_PASS_SHADOW_B : DENOISING_PASS_SHADOW_A; + + path_total = ensure_finite(path_total); + path_total_shaded = ensure_finite(path_total_shaded); + + kernel_write_pass_float(buffer, path_total); + kernel_write_pass_float(buffer+1, path_total_shaded); + + float value = path_total_shaded / max(path_total, 1e-7f); + kernel_write_pass_float(buffer+2, value*value); +} +#endif /* __DENOISING_FEATURES__ */ + +ccl_device_inline void kernel_update_denoising_features(KernelGlobals *kg, + ShaderData *sd, + ccl_addr_space PathState *state, + PathRadiance *L) +{ +#ifdef __DENOISING_FEATURES__ + if(state->denoising_feature_weight == 0.0f) { + return; + } + + L->denoising_depth += ensure_finite(state->denoising_feature_weight * sd->ray_length); + + /* Skip implicitly transparent surfaces. 
*/ + if(sd->flag & SD_HAS_ONLY_VOLUME) { + return; + } + + float3 normal = make_float3(0.0f, 0.0f, 0.0f); + float3 albedo = make_float3(0.0f, 0.0f, 0.0f); + float sum_weight = 0.0f, sum_nonspecular_weight = 0.0f; + + for(int i = 0; i < sd->num_closure; i++) { + ShaderClosure *sc = &sd->closure[i]; + + if(!CLOSURE_IS_BSDF_OR_BSSRDF(sc->type)) + continue; + + /* All closures contribute to the normal feature, but only diffuse-like ones to the albedo. */ + normal += sc->N * sc->sample_weight; + sum_weight += sc->sample_weight; + if(!bsdf_is_specular_like(sc)) { + albedo += sc->weight; + sum_nonspecular_weight += sc->sample_weight; + } + } + + /* Wait for next bounce if 75% or more sample weight belongs to specular-like closures. */ + if((sum_weight == 0.0f) || (sum_nonspecular_weight*4.0f > sum_weight)) { + if(sum_weight != 0.0f) { + normal /= sum_weight; + } + L->denoising_normal += ensure_finite3(state->denoising_feature_weight * normal); + L->denoising_albedo += ensure_finite3(state->denoising_feature_weight * albedo); + + state->denoising_feature_weight = 0.0f; + } +#else + (void) kg; + (void) sd; + (void) state; + (void) L; +#endif /* __DENOISING_FEATURES__ */ +} + +#ifdef __KERNEL_DEBUG__ +ccl_device_inline void kernel_write_debug_passes(KernelGlobals *kg, + ccl_global float *buffer, + PathRadiance *L) +{ + int flag = kernel_data.film.pass_flag; + if(flag & PASS_BVH_TRAVERSED_NODES) { + kernel_write_pass_float(buffer + kernel_data.film.pass_bvh_traversed_nodes, + L->debug_data.num_bvh_traversed_nodes); + } + if(flag & PASS_BVH_TRAVERSED_INSTANCES) { + kernel_write_pass_float(buffer + kernel_data.film.pass_bvh_traversed_instances, + L->debug_data.num_bvh_traversed_instances); + } + if(flag & PASS_BVH_INTERSECTIONS) { + kernel_write_pass_float(buffer + kernel_data.film.pass_bvh_intersections, + L->debug_data.num_bvh_intersections); + } + if(flag & PASS_RAY_BOUNCES) { + kernel_write_pass_float(buffer + kernel_data.film.pass_ray_bounces, + 
L->debug_data.num_ray_bounces); + } +} +#endif /* __KERNEL_DEBUG__ */ + ccl_device_inline void kernel_write_data_passes(KernelGlobals *kg, ccl_global float *buffer, PathRadiance *L, - ShaderData *sd, int sample, ccl_addr_space PathState *state, float3 throughput) + ShaderData *sd, ccl_addr_space PathState *state, float3 throughput) { #ifdef __PASSES__ int path_flag = state->flag; @@ -75,38 +204,37 @@ ccl_device_inline void kernel_write_data_passes(KernelGlobals *kg, ccl_global fl return; if(!(path_flag & PATH_RAY_SINGLE_PASS_DONE)) { - if(!(ccl_fetch(sd, flag) & SD_TRANSPARENT) || + if(!(sd->flag & SD_TRANSPARENT) || kernel_data.film.pass_alpha_threshold == 0.0f || average(shader_bsdf_alpha(kg, sd)) >= kernel_data.film.pass_alpha_threshold) { - - if(sample == 0) { + if(state->sample == 0) { if(flag & PASS_DEPTH) { - float depth = camera_distance(kg, ccl_fetch(sd, P)); - kernel_write_pass_float(buffer + kernel_data.film.pass_depth, sample, depth); + float depth = camera_distance(kg, sd->P); + kernel_write_pass_float(buffer + kernel_data.film.pass_depth, depth); } if(flag & PASS_OBJECT_ID) { - float id = object_pass_id(kg, ccl_fetch(sd, object)); - kernel_write_pass_float(buffer + kernel_data.film.pass_object_id, sample, id); + float id = object_pass_id(kg, sd->object); + kernel_write_pass_float(buffer + kernel_data.film.pass_object_id, id); } if(flag & PASS_MATERIAL_ID) { float id = shader_pass_id(kg, sd); - kernel_write_pass_float(buffer + kernel_data.film.pass_material_id, sample, id); + kernel_write_pass_float(buffer + kernel_data.film.pass_material_id, id); } } if(flag & PASS_NORMAL) { - float3 normal = ccl_fetch(sd, N); - kernel_write_pass_float3(buffer + kernel_data.film.pass_normal, sample, normal); + float3 normal = shader_bsdf_average_normal(kg, sd); + kernel_write_pass_float3(buffer + kernel_data.film.pass_normal, normal); } if(flag & PASS_UV) { float3 uv = primitive_uv(kg, sd); - kernel_write_pass_float3(buffer + kernel_data.film.pass_uv, sample, uv); + 
kernel_write_pass_float3(buffer + kernel_data.film.pass_uv, uv); } if(flag & PASS_MOTION) { float4 speed = primitive_motion_vector(kg, sd); - kernel_write_pass_float4(buffer + kernel_data.film.pass_motion, sample, speed); - kernel_write_pass_float(buffer + kernel_data.film.pass_motion_weight, sample, 1.0f); + kernel_write_pass_float4(buffer + kernel_data.film.pass_motion, speed); + kernel_write_pass_float(buffer + kernel_data.film.pass_motion_weight, 1.0f); } state->flag |= PATH_RAY_SINGLE_PASS_DONE; @@ -127,7 +255,7 @@ ccl_device_inline void kernel_write_data_passes(KernelGlobals *kg, ccl_global fl float mist_start = kernel_data.film.mist_start; float mist_inv_depth = kernel_data.film.mist_inv_depth; - float depth = camera_distance(kg, ccl_fetch(sd, P)); + float depth = camera_distance(kg, sd->P); float mist = saturate((depth - mist_start)*mist_inv_depth); /* falloff */ @@ -149,7 +277,7 @@ ccl_device_inline void kernel_write_data_passes(KernelGlobals *kg, ccl_global fl #endif } -ccl_device_inline void kernel_write_light_passes(KernelGlobals *kg, ccl_global float *buffer, PathRadiance *L, int sample) +ccl_device_inline void kernel_write_light_passes(KernelGlobals *kg, ccl_global float *buffer, PathRadiance *L) { #ifdef __PASSES__ int flag = kernel_data.film.pass_flag; @@ -158,44 +286,103 @@ ccl_device_inline void kernel_write_light_passes(KernelGlobals *kg, ccl_global f return; if(flag & PASS_DIFFUSE_INDIRECT) - kernel_write_pass_float3(buffer + kernel_data.film.pass_diffuse_indirect, sample, L->indirect_diffuse); + kernel_write_pass_float3(buffer + kernel_data.film.pass_diffuse_indirect, L->indirect_diffuse); if(flag & PASS_GLOSSY_INDIRECT) - kernel_write_pass_float3(buffer + kernel_data.film.pass_glossy_indirect, sample, L->indirect_glossy); + kernel_write_pass_float3(buffer + kernel_data.film.pass_glossy_indirect, L->indirect_glossy); if(flag & PASS_TRANSMISSION_INDIRECT) - kernel_write_pass_float3(buffer + kernel_data.film.pass_transmission_indirect, sample, 
L->indirect_transmission); + kernel_write_pass_float3(buffer + kernel_data.film.pass_transmission_indirect, L->indirect_transmission); if(flag & PASS_SUBSURFACE_INDIRECT) - kernel_write_pass_float3(buffer + kernel_data.film.pass_subsurface_indirect, sample, L->indirect_subsurface); + kernel_write_pass_float3(buffer + kernel_data.film.pass_subsurface_indirect, L->indirect_subsurface); if(flag & PASS_DIFFUSE_DIRECT) - kernel_write_pass_float3(buffer + kernel_data.film.pass_diffuse_direct, sample, L->direct_diffuse); + kernel_write_pass_float3(buffer + kernel_data.film.pass_diffuse_direct, L->direct_diffuse); if(flag & PASS_GLOSSY_DIRECT) - kernel_write_pass_float3(buffer + kernel_data.film.pass_glossy_direct, sample, L->direct_glossy); + kernel_write_pass_float3(buffer + kernel_data.film.pass_glossy_direct, L->direct_glossy); if(flag & PASS_TRANSMISSION_DIRECT) - kernel_write_pass_float3(buffer + kernel_data.film.pass_transmission_direct, sample, L->direct_transmission); + kernel_write_pass_float3(buffer + kernel_data.film.pass_transmission_direct, L->direct_transmission); if(flag & PASS_SUBSURFACE_DIRECT) - kernel_write_pass_float3(buffer + kernel_data.film.pass_subsurface_direct, sample, L->direct_subsurface); + kernel_write_pass_float3(buffer + kernel_data.film.pass_subsurface_direct, L->direct_subsurface); if(flag & PASS_EMISSION) - kernel_write_pass_float3(buffer + kernel_data.film.pass_emission, sample, L->emission); + kernel_write_pass_float3(buffer + kernel_data.film.pass_emission, L->emission); if(flag & PASS_BACKGROUND) - kernel_write_pass_float3(buffer + kernel_data.film.pass_background, sample, L->background); + kernel_write_pass_float3(buffer + kernel_data.film.pass_background, L->background); if(flag & PASS_AO) - kernel_write_pass_float3(buffer + kernel_data.film.pass_ao, sample, L->ao); + kernel_write_pass_float3(buffer + kernel_data.film.pass_ao, L->ao); if(flag & PASS_DIFFUSE_COLOR) - kernel_write_pass_float3(buffer + 
kernel_data.film.pass_diffuse_color, sample, L->color_diffuse); + kernel_write_pass_float3(buffer + kernel_data.film.pass_diffuse_color, L->color_diffuse); if(flag & PASS_GLOSSY_COLOR) - kernel_write_pass_float3(buffer + kernel_data.film.pass_glossy_color, sample, L->color_glossy); + kernel_write_pass_float3(buffer + kernel_data.film.pass_glossy_color, L->color_glossy); if(flag & PASS_TRANSMISSION_COLOR) - kernel_write_pass_float3(buffer + kernel_data.film.pass_transmission_color, sample, L->color_transmission); + kernel_write_pass_float3(buffer + kernel_data.film.pass_transmission_color, L->color_transmission); if(flag & PASS_SUBSURFACE_COLOR) - kernel_write_pass_float3(buffer + kernel_data.film.pass_subsurface_color, sample, L->color_subsurface); + kernel_write_pass_float3(buffer + kernel_data.film.pass_subsurface_color, L->color_subsurface); if(flag & PASS_SHADOW) { float4 shadow = L->shadow; shadow.w = kernel_data.film.pass_shadow_scale; - kernel_write_pass_float4(buffer + kernel_data.film.pass_shadow, sample, shadow); + kernel_write_pass_float4(buffer + kernel_data.film.pass_shadow, shadow); } if(flag & PASS_MIST) - kernel_write_pass_float(buffer + kernel_data.film.pass_mist, sample, 1.0f - L->mist); + kernel_write_pass_float(buffer + kernel_data.film.pass_mist, 1.0f - L->mist); +#endif +} + +ccl_device_inline void kernel_write_result(KernelGlobals *kg, + ccl_global float *buffer, + int sample, + PathRadiance *L) +{ + float alpha; + float3 L_sum = path_radiance_clamp_and_sum(kg, L, &alpha); + + kernel_write_pass_float4(buffer, make_float4(L_sum.x, L_sum.y, L_sum.z, alpha)); + + kernel_write_light_passes(kg, buffer, L); + +#ifdef __DENOISING_FEATURES__ + if(kernel_data.film.pass_denoising_data) { +# ifdef __SHADOW_TRICKS__ + kernel_write_denoising_shadow(kg, + buffer + kernel_data.film.pass_denoising_data, + sample, + average(L->path_total), + average(L->path_total_shaded)); +# else + kernel_write_denoising_shadow(kg, + buffer + 
kernel_data.film.pass_denoising_data, + sample, + 0.0f, 0.0f); +# endif + if(kernel_data.film.pass_denoising_clean) { + float3 noisy, clean; + path_radiance_split_denoising(kg, L, &noisy, &clean); + kernel_write_pass_float3_variance( + buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_COLOR, + noisy); + kernel_write_pass_float3_unaligned( + buffer + kernel_data.film.pass_denoising_clean, + clean); + } + else { + kernel_write_pass_float3_variance(buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_COLOR, + ensure_finite3(L_sum)); + } + + kernel_write_pass_float3_variance( + buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_NORMAL, + L->denoising_normal); + kernel_write_pass_float3_variance( + buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_ALBEDO, + L->denoising_albedo); + kernel_write_pass_float_variance( + buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_DEPTH, + L->denoising_depth); + } +#endif /* __DENOISING_FEATURES__ */ + + +#ifdef __KERNEL_DEBUG__ + kernel_write_debug_passes(kg, buffer, L); #endif } diff --git a/intern/cycles/kernel/kernel_path.h b/intern/cycles/kernel/kernel_path.h index 6d89a89ed5b..652777a77a0 100644 --- a/intern/cycles/kernel/kernel_path.h +++ b/intern/cycles/kernel/kernel_path.h @@ -15,57 +15,344 @@ */ #ifdef __OSL__ -# include "osl_shader.h" +# include "kernel/osl/osl_shader.h" #endif -#include "kernel_random.h" -#include "kernel_projection.h" -#include "kernel_montecarlo.h" -#include "kernel_differential.h" -#include "kernel_camera.h" +#include "kernel/kernel_random.h" +#include "kernel/kernel_projection.h" +#include "kernel/kernel_montecarlo.h" +#include "kernel/kernel_differential.h" +#include "kernel/kernel_camera.h" -#include "geom/geom.h" -#include "bvh/bvh.h" +#include "kernel/geom/geom.h" +#include "kernel/bvh/bvh.h" -#include "kernel_accumulate.h" -#include "kernel_shader.h" -#include "kernel_light.h" -#include "kernel_passes.h" +#include 
"kernel/kernel_accumulate.h" +#include "kernel/kernel_shader.h" +#include "kernel/kernel_light.h" +#include "kernel/kernel_passes.h" #ifdef __SUBSURFACE__ -# include "kernel_subsurface.h" +# include "kernel/kernel_subsurface.h" #endif #ifdef __VOLUME__ -# include "kernel_volume.h" +# include "kernel/kernel_volume.h" #endif -#include "kernel_path_state.h" -#include "kernel_shadow.h" -#include "kernel_emission.h" -#include "kernel_path_common.h" -#include "kernel_path_surface.h" -#include "kernel_path_volume.h" +#include "kernel/kernel_path_state.h" +#include "kernel/kernel_shadow.h" +#include "kernel/kernel_emission.h" +#include "kernel/kernel_path_common.h" +#include "kernel/kernel_path_surface.h" +#include "kernel/kernel_path_volume.h" +#include "kernel/kernel_path_subsurface.h" + +CCL_NAMESPACE_BEGIN + +ccl_device_forceinline bool kernel_path_scene_intersect( + KernelGlobals *kg, + ccl_addr_space PathState *state, + Ray *ray, + Intersection *isect, + PathRadiance *L) +{ + uint visibility = path_state_ray_visibility(kg, state); + +#ifdef __HAIR__ + float difl = 0.0f, extmax = 0.0f; + uint lcg_state = 0; + + if(kernel_data.bvh.have_curves) { + if((kernel_data.cam.resolution == 1) && (state->flag & PATH_RAY_CAMERA)) { + float3 pixdiff = ray->dD.dx + ray->dD.dy; + /*pixdiff = pixdiff - dot(pixdiff, ray.D)*ray.D;*/ + difl = kernel_data.curve.minimum_width * len(pixdiff) * 0.5f; + } + + extmax = kernel_data.curve.maximum_width; + lcg_state = lcg_state_init_addrspace(state, 0x51633e2d); + } + + if(path_state_ao_bounce(kg, state)) { + visibility = PATH_RAY_SHADOW; + ray->t = kernel_data.background.ao_distance; + } + + bool hit = scene_intersect(kg, *ray, visibility, isect, &lcg_state, difl, extmax); +#else + bool hit = scene_intersect(kg, *ray, visibility, isect, NULL, 0.0f, 0.0f); +#endif /* __HAIR__ */ #ifdef __KERNEL_DEBUG__ -# include "kernel_debug.h" -#endif + if(state->flag & PATH_RAY_CAMERA) { + L->debug_data.num_bvh_traversed_nodes += isect->num_traversed_nodes; 
+ L->debug_data.num_bvh_traversed_instances += isect->num_traversed_instances; + L->debug_data.num_bvh_intersections += isect->num_intersections; + } + L->debug_data.num_ray_bounces++; +#endif /* __KERNEL_DEBUG__ */ -CCL_NAMESPACE_BEGIN + return hit; +} + +ccl_device_forceinline void kernel_path_lamp_emission( + KernelGlobals *kg, + ccl_addr_space PathState *state, + Ray *ray, + float3 throughput, + ccl_addr_space Intersection *isect, + ShaderData *emission_sd, + PathRadiance *L) +{ +#ifdef __LAMP_MIS__ + if(kernel_data.integrator.use_lamp_mis && !(state->flag & PATH_RAY_CAMERA)) { + /* ray starting from previous non-transparent bounce */ + Ray light_ray; + + light_ray.P = ray->P - state->ray_t*ray->D; + state->ray_t += isect->t; + light_ray.D = ray->D; + light_ray.t = state->ray_t; + light_ray.time = ray->time; + light_ray.dD = ray->dD; + light_ray.dP = ray->dP; + + /* intersect with lamp */ + float3 emission; + + if(indirect_lamp_emission(kg, emission_sd, state, &light_ray, &emission)) + path_radiance_accum_emission(L, state, throughput, emission); + } +#endif /* __LAMP_MIS__ */ +} + +ccl_device_forceinline void kernel_path_background( + KernelGlobals *kg, + ccl_addr_space PathState *state, + ccl_addr_space Ray *ray, + float3 throughput, + ShaderData *emission_sd, + PathRadiance *L) +{ + /* eval background shader if nothing hit */ + if(kernel_data.background.transparent && (state->flag & PATH_RAY_CAMERA)) { + L->transparent += average(throughput); + +#ifdef __PASSES__ + if(!(kernel_data.film.pass_flag & PASS_BACKGROUND)) +#endif /* __PASSES__ */ + return; + } + +#ifdef __BACKGROUND__ + /* sample background shader */ + float3 L_background = indirect_background(kg, emission_sd, state, ray); + path_radiance_accum_background(L, state, throughput, L_background); +#endif /* __BACKGROUND__ */ +} + +#ifndef __SPLIT_KERNEL__ + +#ifdef __VOLUME__ +ccl_device_forceinline VolumeIntegrateResult kernel_path_volume( + KernelGlobals *kg, + ShaderData *sd, + PathState *state, + 
Ray *ray, + float3 *throughput, + ccl_addr_space Intersection *isect, + bool hit, + ShaderData *emission_sd, + PathRadiance *L) +{ + /* Sanitize volume stack. */ + if(!hit) { + kernel_volume_clean_stack(kg, state->volume_stack); + } + + if(state->volume_stack[0].shader == SHADER_NONE) { + return VOLUME_PATH_ATTENUATED; + } + + /* volume attenuation, emission, scatter */ + Ray volume_ray = *ray; + volume_ray.t = (hit)? isect->t: FLT_MAX; + + bool heterogeneous = volume_stack_is_heterogeneous(kg, state->volume_stack); + +# ifdef __VOLUME_DECOUPLED__ + int sampling_method = volume_stack_sampling_method(kg, state->volume_stack); + bool direct = (state->flag & PATH_RAY_CAMERA) != 0; + bool decoupled = kernel_volume_use_decoupled(kg, heterogeneous, direct, sampling_method); + + if(decoupled) { + /* cache steps along volume for repeated sampling */ + VolumeSegment volume_segment; + + shader_setup_from_volume(kg, sd, &volume_ray); + kernel_volume_decoupled_record(kg, state, + &volume_ray, sd, &volume_segment, heterogeneous); + + volume_segment.sampling_method = sampling_method; + + /* emission */ + if(volume_segment.closure_flag & SD_EMISSION) + path_radiance_accum_emission(L, state, *throughput, volume_segment.accum_emission); + + /* scattering */ + VolumeIntegrateResult result = VOLUME_PATH_ATTENUATED; + + if(volume_segment.closure_flag & SD_SCATTER) { + int all = kernel_data.integrator.sample_all_lights_indirect; + + /* direct light sampling */ + kernel_branched_path_volume_connect_light(kg, sd, + emission_sd, *throughput, state, L, all, + &volume_ray, &volume_segment); + + /* indirect sample. 
if we use distance sampling and take just + * one sample for direct and indirect light, we could share + * this computation, but makes code a bit complex */ + float rphase = path_state_rng_1D(kg, state, PRNG_PHASE_CHANNEL); + float rscatter = path_state_rng_1D(kg, state, PRNG_SCATTER_DISTANCE); + + result = kernel_volume_decoupled_scatter(kg, + state, &volume_ray, sd, throughput, + rphase, rscatter, &volume_segment, NULL, true); + } + + /* free cached steps */ + kernel_volume_decoupled_free(kg, &volume_segment); + + if(result == VOLUME_PATH_SCATTERED) { + if(kernel_path_volume_bounce(kg, sd, throughput, state, &L->state, ray)) + return VOLUME_PATH_SCATTERED; + else + return VOLUME_PATH_MISSED; + } + else { + *throughput *= volume_segment.accum_transmittance; + } + } + else +# endif /* __VOLUME_DECOUPLED__ */ + { + /* integrate along volume segment with distance sampling */ + VolumeIntegrateResult result = kernel_volume_integrate( + kg, state, sd, &volume_ray, L, throughput, heterogeneous); + +# ifdef __VOLUME_SCATTER__ + if(result == VOLUME_PATH_SCATTERED) { + /* direct lighting */ + kernel_path_volume_connect_light(kg, sd, emission_sd, *throughput, state, L); + + /* indirect light bounce */ + if(kernel_path_volume_bounce(kg, sd, throughput, state, &L->state, ray)) + return VOLUME_PATH_SCATTERED; + else + return VOLUME_PATH_MISSED; + } +# endif /* __VOLUME_SCATTER__ */ + } + + return VOLUME_PATH_ATTENUATED; +} +#endif /* __VOLUME__ */ + +#endif /* __SPLIT_KERNEL__ */ + +ccl_device_forceinline bool kernel_path_shader_apply( + KernelGlobals *kg, + ShaderData *sd, + ccl_addr_space PathState *state, + ccl_addr_space Ray *ray, + float3 throughput, + ShaderData *emission_sd, + PathRadiance *L, + ccl_global float *buffer) +{ +#ifdef __SHADOW_TRICKS__ + if((sd->object_flag & SD_OBJECT_SHADOW_CATCHER)) { + if(state->flag & PATH_RAY_CAMERA) { + state->flag |= (PATH_RAY_SHADOW_CATCHER | + PATH_RAY_STORE_SHADOW_INFO); + + float3 bg = make_float3(0.0f, 0.0f, 0.0f); + 
if(!kernel_data.background.transparent) { + bg = indirect_background(kg, emission_sd, state, ray); + } + path_radiance_accum_shadowcatcher(L, throughput, bg); + } + } + else if(state->flag & PATH_RAY_SHADOW_CATCHER) { + /* Only update transparency after shadow catcher bounce. */ + L->shadow_transparency *= + average(shader_bsdf_transparency(kg, sd)); + } +#endif /* __SHADOW_TRICKS__ */ + + /* holdout */ +#ifdef __HOLDOUT__ + if(((sd->flag & SD_HOLDOUT) || + (sd->object_flag & SD_OBJECT_HOLDOUT_MASK)) && + (state->flag & PATH_RAY_CAMERA)) + { + if(kernel_data.background.transparent) { + float3 holdout_weight; + if(sd->object_flag & SD_OBJECT_HOLDOUT_MASK) { + holdout_weight = make_float3(1.0f, 1.0f, 1.0f); + } + else { + holdout_weight = shader_holdout_eval(kg, sd); + } + /* any throughput is ok, should all be identical here */ + L->transparent += average(holdout_weight*throughput); + } + + if(sd->object_flag & SD_OBJECT_HOLDOUT_MASK) { + return false; + } + } +#endif /* __HOLDOUT__ */ + + /* holdout mask objects do not write data passes */ + kernel_write_data_passes(kg, buffer, L, sd, state, throughput); + + /* blurring of bsdf after bounces, for rays that have a small likelihood + * of following this particular path (diffuse, rough glossy) */ + if(kernel_data.integrator.filter_glossy != FLT_MAX) { + float blur_pdf = kernel_data.integrator.filter_glossy*state->min_ray_pdf; + + if(blur_pdf < 1.0f) { + float blur_roughness = sqrtf(1.0f - blur_pdf)*0.5f; + shader_bsdf_blur(kg, sd, blur_roughness); + } + } + +#ifdef __EMISSION__ + /* emission */ + if(sd->flag & SD_EMISSION) { + float3 emission = indirect_primitive_emission(kg, sd, sd->ray_length, state->flag, state->ray_pdf); + path_radiance_accum_emission(L, state, throughput, emission); + } +#endif /* __EMISSION__ */ + + return true; +} ccl_device_noinline void kernel_path_ao(KernelGlobals *kg, ShaderData *sd, ShaderData *emission_sd, PathRadiance *L, - PathState *state, - RNG *rng, + ccl_addr_space PathState *state, 
float3 throughput, float3 ao_alpha) { /* todo: solve correlation */ float bsdf_u, bsdf_v; - path_state_rng_2D(kg, rng, state, PRNG_BSDF_U, &bsdf_u, &bsdf_v); + path_state_rng_2D(kg, state, PRNG_BSDF_U, &bsdf_u, &bsdf_v); float ao_factor = kernel_data.background.ao_factor; float3 ao_N; @@ -75,267 +362,107 @@ ccl_device_noinline void kernel_path_ao(KernelGlobals *kg, sample_cos_hemisphere(ao_N, bsdf_u, bsdf_v, &ao_D, &ao_pdf); - if(dot(ccl_fetch(sd, Ng), ao_D) > 0.0f && ao_pdf != 0.0f) { + if(dot(sd->Ng, ao_D) > 0.0f && ao_pdf != 0.0f) { Ray light_ray; float3 ao_shadow; - light_ray.P = ray_offset(ccl_fetch(sd, P), ccl_fetch(sd, Ng)); + light_ray.P = ray_offset(sd->P, sd->Ng); light_ray.D = ao_D; light_ray.t = kernel_data.background.ao_distance; -#ifdef __OBJECT_MOTION__ - light_ray.time = ccl_fetch(sd, time); -#endif /* __OBJECT_MOTION__ */ - light_ray.dP = ccl_fetch(sd, dP); + light_ray.time = sd->time; + light_ray.dP = sd->dP; light_ray.dD = differential3_zero(); - if(!shadow_blocked(kg, emission_sd, state, &light_ray, &ao_shadow)) { - path_radiance_accum_ao(L, throughput, ao_alpha, ao_bsdf, ao_shadow, state->bounce); + if(!shadow_blocked(kg, sd, emission_sd, state, &light_ray, &ao_shadow)) { + path_radiance_accum_ao(L, state, throughput, ao_alpha, ao_bsdf, ao_shadow); + } + else { + path_radiance_accum_total_ao(L, state, throughput, ao_bsdf); } } } +#ifndef __SPLIT_KERNEL__ + +#if defined(__BRANCHED_PATH__) || defined(__BAKING__) + ccl_device void kernel_path_indirect(KernelGlobals *kg, ShaderData *sd, ShaderData *emission_sd, - RNG *rng, Ray *ray, float3 throughput, - int num_samples, PathState *state, PathRadiance *L) { /* path iteration */ for(;;) { - /* intersect scene */ + /* Find intersection with objects in scene. 
*/ Intersection isect; - uint visibility = path_state_ray_visibility(kg, state); - bool hit = scene_intersect(kg, - *ray, - visibility, - &isect, - NULL, - 0.0f, 0.0f); + bool hit = kernel_path_scene_intersect(kg, state, ray, &isect, L); -#ifdef __LAMP_MIS__ - if(kernel_data.integrator.use_lamp_mis && !(state->flag & PATH_RAY_CAMERA)) { - /* ray starting from previous non-transparent bounce */ - Ray light_ray; - - light_ray.P = ray->P - state->ray_t*ray->D; - state->ray_t += isect.t; - light_ray.D = ray->D; - light_ray.t = state->ray_t; - light_ray.time = ray->time; - light_ray.dD = ray->dD; - light_ray.dP = ray->dP; - - /* intersect with lamp */ - float3 emission; - if(indirect_lamp_emission(kg, emission_sd, state, &light_ray, &emission)) { - path_radiance_accum_emission(L, - throughput, - emission, - state->bounce); - } - } -#endif /* __LAMP_MIS__ */ + /* Find intersection with lamps and compute emission for MIS. */ + kernel_path_lamp_emission(kg, state, ray, throughput, &isect, emission_sd, L); #ifdef __VOLUME__ - /* volume attenuation, emission, scatter */ - if(state->volume_stack[0].shader != SHADER_NONE) { - Ray volume_ray = *ray; - volume_ray.t = (hit)? 
isect.t: FLT_MAX; - - bool heterogeneous = - volume_stack_is_heterogeneous(kg, - state->volume_stack); - -# ifdef __VOLUME_DECOUPLED__ - int sampling_method = - volume_stack_sampling_method(kg, - state->volume_stack); - bool decoupled = kernel_volume_use_decoupled(kg, heterogeneous, false, sampling_method); - - if(decoupled) { - /* cache steps along volume for repeated sampling */ - VolumeSegment volume_segment; - - shader_setup_from_volume(kg, - sd, - &volume_ray); - kernel_volume_decoupled_record(kg, - state, - &volume_ray, - sd, - &volume_segment, - heterogeneous); - - volume_segment.sampling_method = sampling_method; - - /* emission */ - if(volume_segment.closure_flag & SD_EMISSION) { - path_radiance_accum_emission(L, - throughput, - volume_segment.accum_emission, - state->bounce); - } - - /* scattering */ - VolumeIntegrateResult result = VOLUME_PATH_ATTENUATED; - - if(volume_segment.closure_flag & SD_SCATTER) { - int all = kernel_data.integrator.sample_all_lights_indirect; - - /* direct light sampling */ - kernel_branched_path_volume_connect_light(kg, - rng, - sd, - emission_sd, - throughput, - state, - L, - all, - &volume_ray, - &volume_segment); - - /* indirect sample. 
if we use distance sampling and take just - * one sample for direct and indirect light, we could share - * this computation, but makes code a bit complex */ - float rphase = path_state_rng_1D_for_decision(kg, rng, state, PRNG_PHASE); - float rscatter = path_state_rng_1D_for_decision(kg, rng, state, PRNG_SCATTER_DISTANCE); - - result = kernel_volume_decoupled_scatter(kg, - state, - &volume_ray, - sd, - &throughput, - rphase, - rscatter, - &volume_segment, - NULL, - true); - } - - /* free cached steps */ - kernel_volume_decoupled_free(kg, &volume_segment); - - if(result == VOLUME_PATH_SCATTERED) { - if(kernel_path_volume_bounce(kg, - rng, - sd, - &throughput, - state, - L, - ray)) - { - continue; - } - else { - break; - } - } - else { - throughput *= volume_segment.accum_transmittance; - } - } - else -# endif /* __VOLUME_DECOUPLED__ */ - { - /* integrate along volume segment with distance sampling */ - VolumeIntegrateResult result = kernel_volume_integrate( - kg, state, sd, &volume_ray, L, &throughput, rng, heterogeneous); - -# ifdef __VOLUME_SCATTER__ - if(result == VOLUME_PATH_SCATTERED) { - /* direct lighting */ - kernel_path_volume_connect_light(kg, - rng, - sd, - emission_sd, - throughput, - state, - L); - - /* indirect light bounce */ - if(kernel_path_volume_bounce(kg, - rng, - sd, - &throughput, - state, - L, - ray)) - { - continue; - } - else { - break; - } - } -# endif /* __VOLUME_SCATTER__ */ - } + /* Volume integration. */ + VolumeIntegrateResult result = kernel_path_volume(kg, + sd, + state, + ray, + &throughput, + &isect, + hit, + emission_sd, + L); + + if(result == VOLUME_PATH_SCATTERED) { + continue; } -#endif /* __VOLUME__ */ + else if(result == VOLUME_PATH_MISSED) { + break; + } +#endif /* __VOLUME__*/ + /* Shade background. 
*/ if(!hit) { -#ifdef __BACKGROUND__ - /* sample background shader */ - float3 L_background = indirect_background(kg, emission_sd, state, ray); - path_radiance_accum_background(L, - throughput, - L_background, - state->bounce); -#endif /* __BACKGROUND__ */ - + kernel_path_background(kg, state, ray, throughput, emission_sd, L); + break; + } + else if(path_state_ao_bounce(kg, state)) { break; } - /* setup shading */ + /* Setup and evaluate shader. */ shader_setup_from_ray(kg, sd, &isect, ray); - float rbsdf = path_state_rng_1D_for_decision(kg, rng, state, PRNG_BSDF); - shader_eval_surface(kg, sd, rng, state, rbsdf, state->flag, SHADER_CONTEXT_INDIRECT); -#ifdef __BRANCHED_PATH__ - shader_merge_closures(sd); -#endif /* __BRANCHED_PATH__ */ - - /* blurring of bsdf after bounces, for rays that have a small likelihood - * of following this particular path (diffuse, rough glossy) */ - if(kernel_data.integrator.filter_glossy != FLT_MAX) { - float blur_pdf = kernel_data.integrator.filter_glossy*state->min_ray_pdf; - - if(blur_pdf < 1.0f) { - float blur_roughness = sqrtf(1.0f - blur_pdf)*0.5f; - shader_bsdf_blur(kg, sd, blur_roughness); - } - } - -#ifdef __EMISSION__ - /* emission */ - if(sd->flag & SD_EMISSION) { - float3 emission = indirect_primitive_emission(kg, - sd, - isect.t, - state->flag, - state->ray_pdf); - path_radiance_accum_emission(L, throughput, emission, state->bounce); + shader_eval_surface(kg, sd, state, state->flag); + shader_prepare_closures(sd, state); + + /* Apply shadow catcher, holdout, emission. */ + if(!kernel_path_shader_apply(kg, + sd, + state, + ray, + throughput, + emission_sd, + L, + NULL)) + { + break; } -#endif /* __EMISSION__ */ /* path termination. this is a strange place to put the termination, it's * mainly due to the mixed in MIS that we use. 
gives too many unneeded * shader evaluations, only need emission if we are going to terminate */ - float probability = - path_state_terminate_probability(kg, - state, - throughput*num_samples); + float probability = path_state_continuation_probability(kg, state, throughput); if(probability == 0.0f) { break; } else if(probability != 1.0f) { - float terminate = path_state_rng_1D_for_decision(kg, rng, state, PRNG_TERMINATE); + float terminate = path_state_rng_1D(kg, state, PRNG_TERMINATE); if(terminate >= probability) break; @@ -343,10 +470,12 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg, throughput /= probability; } + kernel_update_denoising_features(kg, sd, state, L); + #ifdef __AO__ /* ambient occlusion */ if(kernel_data.integrator.use_ambient_occlusion || (sd->flag & SD_AO)) { - kernel_path_ao(kg, sd, emission_sd, L, state, rng, throughput, make_float3(0.0f, 0.0f, 0.0f)); + kernel_path_ao(kg, sd, emission_sd, L, state, throughput, make_float3(0.0f, 0.0f, 0.0f)); } #endif /* __AO__ */ @@ -354,22 +483,18 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg, /* bssrdf scatter to a different location on the same object, replacing * the closures with a diffuse BSDF */ if(sd->flag & SD_BSSRDF) { - float bssrdf_probability; - ShaderClosure *sc = subsurface_scatter_pick_closure(kg, sd, &bssrdf_probability); + float bssrdf_u, bssrdf_v; + path_state_rng_2D(kg, + state, + PRNG_BSDF_U, + &bssrdf_u, &bssrdf_v); - /* modify throughput for picking bssrdf or bsdf */ - throughput *= bssrdf_probability; + const ShaderClosure *sc = shader_bssrdf_pick(sd, &throughput, &bssrdf_u); /* do bssrdf scatter step if we picked a bssrdf closure */ if(sc) { - uint lcg_state = lcg_state_init(rng, state, 0x68bc21eb); - - float bssrdf_u, bssrdf_v; - path_state_rng_2D(kg, - rng, - state, - PRNG_BSDF_U, - &bssrdf_u, &bssrdf_v); + uint lcg_state = lcg_state_init(state, 0x68bc21eb); + subsurface_scatter_step(kg, sd, state, @@ -382,11 +507,11 @@ ccl_device void 
kernel_path_indirect(KernelGlobals *kg, } #endif /* __SUBSURFACE__ */ -#if defined(__EMISSION__) && defined(__BRANCHED_PATH__) +#if defined(__EMISSION__) if(kernel_data.integrator.use_direct_light) { - int all = kernel_data.integrator.sample_all_lights_indirect; + int all = (kernel_data.integrator.sample_all_lights_indirect) || + (state->flag & PATH_RAY_SHADOW_CATCHER); kernel_branched_path_surface_connect_light(kg, - rng, sd, emission_sd, state, @@ -395,205 +520,26 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg, L, all); } -#endif /* defined(__EMISSION__) && defined(__BRANCHED_PATH__) */ +#endif /* defined(__EMISSION__) */ - if(!kernel_path_surface_bounce(kg, rng, sd, &throughput, state, L, ray)) + if(!kernel_path_surface_bounce(kg, sd, &throughput, state, &L->state, ray)) break; } } -#ifdef __SUBSURFACE__ -# ifndef __KERNEL_CUDA__ -ccl_device -# else -ccl_device_inline -# endif -bool kernel_path_subsurface_scatter( - KernelGlobals *kg, - ShaderData *sd, - ShaderData *emission_sd, - PathRadiance *L, - PathState *state, - RNG *rng, - Ray *ray, - float3 *throughput, - SubsurfaceIndirectRays *ss_indirect) -{ - float bssrdf_probability; - ShaderClosure *sc = subsurface_scatter_pick_closure(kg, sd, &bssrdf_probability); +#endif /* defined(__BRANCHED_PATH__) || defined(__BAKING__) */ - /* modify throughput for picking bssrdf or bsdf */ - *throughput *= bssrdf_probability; - - /* do bssrdf scatter step if we picked a bssrdf closure */ - if(sc) { - /* We should never have two consecutive BSSRDF bounces, - * the second one should be converted to a diffuse BSDF to - * avoid this. 
- */ - kernel_assert(!ss_indirect->tracing); - - uint lcg_state = lcg_state_init(rng, state, 0x68bc21eb); - - SubsurfaceIntersection ss_isect; - float bssrdf_u, bssrdf_v; - path_state_rng_2D(kg, rng, state, PRNG_BSDF_U, &bssrdf_u, &bssrdf_v); - int num_hits = subsurface_scatter_multi_intersect(kg, - &ss_isect, - sd, - sc, - &lcg_state, - bssrdf_u, bssrdf_v, - false); -# ifdef __VOLUME__ - ss_indirect->need_update_volume_stack = - kernel_data.integrator.use_volumes && - ccl_fetch(sd, flag) & SD_OBJECT_INTERSECTS_VOLUME; -# endif /* __VOLUME__ */ - - /* compute lighting with the BSDF closure */ - for(int hit = 0; hit < num_hits; hit++) { - /* NOTE: We reuse the existing ShaderData, we assume the path - * integration loop stops when this function returns true. - */ - subsurface_scatter_multi_setup(kg, - &ss_isect, - hit, - sd, - state, - state->flag, - sc, - false); - - PathState *hit_state = &ss_indirect->state[ss_indirect->num_rays]; - Ray *hit_ray = &ss_indirect->rays[ss_indirect->num_rays]; - float3 *hit_tp = &ss_indirect->throughputs[ss_indirect->num_rays]; - PathRadiance *hit_L = &ss_indirect->L[ss_indirect->num_rays]; - - *hit_state = *state; - *hit_ray = *ray; - *hit_tp = *throughput; - - hit_state->rng_offset += PRNG_BOUNCE_NUM; - - path_radiance_init(hit_L, kernel_data.film.use_light_pass); - hit_L->direct_throughput = L->direct_throughput; - path_radiance_copy_indirect(hit_L, L); - - kernel_path_surface_connect_light(kg, rng, sd, emission_sd, *hit_tp, state, hit_L); - - if(kernel_path_surface_bounce(kg, - rng, - sd, - hit_tp, - hit_state, - hit_L, - hit_ray)) - { -# ifdef __LAMP_MIS__ - hit_state->ray_t = 0.0f; -# endif /* __LAMP_MIS__ */ - -# ifdef __VOLUME__ - if(ss_indirect->need_update_volume_stack) { - Ray volume_ray = *ray; - /* Setup ray from previous surface point to the new one. 
*/ - volume_ray.D = normalize_len(hit_ray->P - volume_ray.P, - &volume_ray.t); - - kernel_volume_stack_update_for_subsurface( - kg, - emission_sd, - &volume_ray, - hit_state->volume_stack); - } -# endif /* __VOLUME__ */ - path_radiance_reset_indirect(L); - ss_indirect->num_rays++; - } - else { - path_radiance_accum_sample(L, hit_L, 1); - } - } - return true; - } - return false; -} - -ccl_device_inline void kernel_path_subsurface_init_indirect( - SubsurfaceIndirectRays *ss_indirect) -{ - ss_indirect->tracing = false; - ss_indirect->num_rays = 0; -} - -ccl_device void kernel_path_subsurface_accum_indirect( - SubsurfaceIndirectRays *ss_indirect, - PathRadiance *L) +ccl_device_forceinline void kernel_path_integrate( + KernelGlobals *kg, + PathState *state, + float3 throughput, + Ray *ray, + PathRadiance *L, + ccl_global float *buffer, + ShaderData *emission_sd) { - if(ss_indirect->tracing) { - path_radiance_sum_indirect(L); - path_radiance_accum_sample(&ss_indirect->direct_L, L, 1); - if(ss_indirect->num_rays == 0) { - *L = ss_indirect->direct_L; - } - } -} - -ccl_device void kernel_path_subsurface_setup_indirect( - KernelGlobals *kg, - SubsurfaceIndirectRays *ss_indirect, - PathState *state, - Ray *ray, - PathRadiance *L, - float3 *throughput) -{ - if(!ss_indirect->tracing) { - ss_indirect->direct_L = *L; - } - ss_indirect->tracing = true; - - /* Setup state, ray and throughput for indirect SSS rays. 
*/ - ss_indirect->num_rays--; - - Ray *indirect_ray = &ss_indirect->rays[ss_indirect->num_rays]; - PathRadiance *indirect_L = &ss_indirect->L[ss_indirect->num_rays]; - - *state = ss_indirect->state[ss_indirect->num_rays]; - *ray = *indirect_ray; - *L = *indirect_L; - *throughput = ss_indirect->throughputs[ss_indirect->num_rays]; - - state->rng_offset += ss_indirect->num_rays * PRNG_BOUNCE_NUM; -} - -#endif /* __SUBSURFACE__ */ - -ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg, - RNG *rng, - int sample, - Ray ray, - ccl_global float *buffer) -{ - /* initialize */ - PathRadiance L; - float3 throughput = make_float3(1.0f, 1.0f, 1.0f); - float L_transparent = 0.0f; - - path_radiance_init(&L, kernel_data.film.use_light_pass); - - /* shader data memory used for both volumes and surfaces, saves stack space */ + /* Shader data memory used for both volumes and surfaces, saves stack space. */ ShaderData sd; - /* shader data used by emission, shadows, volume stacks */ - ShaderData emission_sd; - - PathState state; - path_state_init(kg, &emission_sd, &state, rng, sample, &ray); - -#ifdef __KERNEL_DEBUG__ - DebugData debug_data; - debug_data_init(&debug_data); -#endif /* __KERNEL_DEBUG__ */ #ifdef __SUBSURFACE__ SubsurfaceIndirectRays ss_indirect; @@ -604,231 +550,82 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg, /* path iteration */ for(;;) { - /* intersect scene */ + /* Find intersection with objects in scene. 
*/ Intersection isect; - uint visibility = path_state_ray_visibility(kg, &state); - -#ifdef __HAIR__ - float difl = 0.0f, extmax = 0.0f; - uint lcg_state = 0; - - if(kernel_data.bvh.have_curves) { - if((kernel_data.cam.resolution == 1) && (state.flag & PATH_RAY_CAMERA)) { - float3 pixdiff = ray.dD.dx + ray.dD.dy; - /*pixdiff = pixdiff - dot(pixdiff, ray.D)*ray.D;*/ - difl = kernel_data.curve.minimum_width * len(pixdiff) * 0.5f; - } - - extmax = kernel_data.curve.maximum_width; - lcg_state = lcg_state_init(rng, &state, 0x51633e2d); - } + bool hit = kernel_path_scene_intersect(kg, state, ray, &isect, L); - bool hit = scene_intersect(kg, ray, visibility, &isect, &lcg_state, difl, extmax); -#else - bool hit = scene_intersect(kg, ray, visibility, &isect, NULL, 0.0f, 0.0f); -#endif /* __HAIR__ */ - -#ifdef __KERNEL_DEBUG__ - if(state.flag & PATH_RAY_CAMERA) { - debug_data.num_bvh_traversal_steps += isect.num_traversal_steps; - debug_data.num_bvh_traversed_instances += isect.num_traversed_instances; - } - debug_data.num_ray_bounces++; -#endif /* __KERNEL_DEBUG__ */ - -#ifdef __LAMP_MIS__ - if(kernel_data.integrator.use_lamp_mis && !(state.flag & PATH_RAY_CAMERA)) { - /* ray starting from previous non-transparent bounce */ - Ray light_ray; - - light_ray.P = ray.P - state.ray_t*ray.D; - state.ray_t += isect.t; - light_ray.D = ray.D; - light_ray.t = state.ray_t; - light_ray.time = ray.time; - light_ray.dD = ray.dD; - light_ray.dP = ray.dP; - - /* intersect with lamp */ - float3 emission; - - if(indirect_lamp_emission(kg, &emission_sd, &state, &light_ray, &emission)) - path_radiance_accum_emission(&L, throughput, emission, state.bounce); - } -#endif /* __LAMP_MIS__ */ + /* Find intersection with lamps and compute emission for MIS. */ + kernel_path_lamp_emission(kg, state, ray, throughput, &isect, emission_sd, L); #ifdef __VOLUME__ - /* volume attenuation, emission, scatter */ - if(state.volume_stack[0].shader != SHADER_NONE) { - Ray volume_ray = ray; - volume_ray.t = (hit)? 
isect.t: FLT_MAX; - - bool heterogeneous = volume_stack_is_heterogeneous(kg, state.volume_stack); - -# ifdef __VOLUME_DECOUPLED__ - int sampling_method = volume_stack_sampling_method(kg, state.volume_stack); - bool decoupled = kernel_volume_use_decoupled(kg, heterogeneous, true, sampling_method); - - if(decoupled) { - /* cache steps along volume for repeated sampling */ - VolumeSegment volume_segment; - - shader_setup_from_volume(kg, &sd, &volume_ray); - kernel_volume_decoupled_record(kg, &state, - &volume_ray, &sd, &volume_segment, heterogeneous); - - volume_segment.sampling_method = sampling_method; - - /* emission */ - if(volume_segment.closure_flag & SD_EMISSION) - path_radiance_accum_emission(&L, throughput, volume_segment.accum_emission, state.bounce); - - /* scattering */ - VolumeIntegrateResult result = VOLUME_PATH_ATTENUATED; - - if(volume_segment.closure_flag & SD_SCATTER) { - int all = false; - - /* direct light sampling */ - kernel_branched_path_volume_connect_light(kg, rng, &sd, - &emission_sd, throughput, &state, &L, all, - &volume_ray, &volume_segment); - - /* indirect sample. 
if we use distance sampling and take just - * one sample for direct and indirect light, we could share - * this computation, but makes code a bit complex */ - float rphase = path_state_rng_1D_for_decision(kg, rng, &state, PRNG_PHASE); - float rscatter = path_state_rng_1D_for_decision(kg, rng, &state, PRNG_SCATTER_DISTANCE); - - result = kernel_volume_decoupled_scatter(kg, - &state, &volume_ray, &sd, &throughput, - rphase, rscatter, &volume_segment, NULL, true); - } - - /* free cached steps */ - kernel_volume_decoupled_free(kg, &volume_segment); - - if(result == VOLUME_PATH_SCATTERED) { - if(kernel_path_volume_bounce(kg, rng, &sd, &throughput, &state, &L, &ray)) - continue; - else - break; - } - else { - throughput *= volume_segment.accum_transmittance; - } - } - else -# endif /* __VOLUME_DECOUPLED__ */ - { - /* integrate along volume segment with distance sampling */ - VolumeIntegrateResult result = kernel_volume_integrate( - kg, &state, &sd, &volume_ray, &L, &throughput, rng, heterogeneous); - -# ifdef __VOLUME_SCATTER__ - if(result == VOLUME_PATH_SCATTERED) { - /* direct lighting */ - kernel_path_volume_connect_light(kg, rng, &sd, &emission_sd, throughput, &state, &L); - - /* indirect light bounce */ - if(kernel_path_volume_bounce(kg, rng, &sd, &throughput, &state, &L, &ray)) - continue; - else - break; - } -# endif /* __VOLUME_SCATTER__ */ - } + /* Volume integration. 
*/ + VolumeIntegrateResult result = kernel_path_volume(kg, + &sd, + state, + ray, + &throughput, + &isect, + hit, + emission_sd, + L); + + if(result == VOLUME_PATH_SCATTERED) { + continue; } -#endif /* __VOLUME__ */ - - if(!hit) { - /* eval background shader if nothing hit */ - if(kernel_data.background.transparent && (state.flag & PATH_RAY_CAMERA)) { - L_transparent += average(throughput); - -#ifdef __PASSES__ - if(!(kernel_data.film.pass_flag & PASS_BACKGROUND)) -#endif /* __PASSES__ */ - break; - } - -#ifdef __BACKGROUND__ - /* sample background shader */ - float3 L_background = indirect_background(kg, &emission_sd, &state, &ray); - path_radiance_accum_background(&L, throughput, L_background, state.bounce); -#endif /* __BACKGROUND__ */ - + else if(result == VOLUME_PATH_MISSED) { break; } +#endif /* __VOLUME__*/ - /* setup shading */ - shader_setup_from_ray(kg, &sd, &isect, &ray); - float rbsdf = path_state_rng_1D_for_decision(kg, rng, &state, PRNG_BSDF); - shader_eval_surface(kg, &sd, rng, &state, rbsdf, state.flag, SHADER_CONTEXT_MAIN); - - /* holdout */ -#ifdef __HOLDOUT__ - if((sd.flag & (SD_HOLDOUT|SD_HOLDOUT_MASK)) && (state.flag & PATH_RAY_CAMERA)) { - if(kernel_data.background.transparent) { - float3 holdout_weight; - - if(sd.flag & SD_HOLDOUT_MASK) - holdout_weight = make_float3(1.0f, 1.0f, 1.0f); - else - holdout_weight = shader_holdout_eval(kg, &sd); - - /* any throughput is ok, should all be identical here */ - L_transparent += average(holdout_weight*throughput); - } - - if(sd.flag & SD_HOLDOUT_MASK) - break; + /* Shade background. 
*/ + if(!hit) { + kernel_path_background(kg, state, ray, throughput, emission_sd, L); + break; } -#endif /* __HOLDOUT__ */ - - /* holdout mask objects do not write data passes */ - kernel_write_data_passes(kg, buffer, &L, &sd, sample, &state, throughput); - - /* blurring of bsdf after bounces, for rays that have a small likelihood - * of following this particular path (diffuse, rough glossy) */ - if(kernel_data.integrator.filter_glossy != FLT_MAX) { - float blur_pdf = kernel_data.integrator.filter_glossy*state.min_ray_pdf; - - if(blur_pdf < 1.0f) { - float blur_roughness = sqrtf(1.0f - blur_pdf)*0.5f; - shader_bsdf_blur(kg, &sd, blur_roughness); - } + else if(path_state_ao_bounce(kg, state)) { + break; } -#ifdef __EMISSION__ - /* emission */ - if(sd.flag & SD_EMISSION) { - /* todo: is isect.t wrong here for transparent surfaces? */ - float3 emission = indirect_primitive_emission(kg, &sd, isect.t, state.flag, state.ray_pdf); - path_radiance_accum_emission(&L, throughput, emission, state.bounce); + /* Setup and evaluate shader. */ + shader_setup_from_ray(kg, &sd, &isect, ray); + shader_eval_surface(kg, &sd, state, state->flag); + shader_prepare_closures(&sd, state); + + /* Apply shadow catcher, holdout, emission. */ + if(!kernel_path_shader_apply(kg, + &sd, + state, + ray, + throughput, + emission_sd, + L, + buffer)) + { + break; } -#endif /* __EMISSION__ */ /* path termination. this is a strange place to put the termination, it's * mainly due to the mixed in MIS that we use. 
gives too many unneeded * shader evaluations, only need emission if we are going to terminate */ - float probability = path_state_terminate_probability(kg, &state, throughput); + float probability = path_state_continuation_probability(kg, state, throughput); if(probability == 0.0f) { break; } else if(probability != 1.0f) { - float terminate = path_state_rng_1D_for_decision(kg, rng, &state, PRNG_TERMINATE); + float terminate = path_state_rng_1D(kg, state, PRNG_TERMINATE); if(terminate >= probability) break; throughput /= probability; } + kernel_update_denoising_features(kg, &sd, state, L); + #ifdef __AO__ /* ambient occlusion */ if(kernel_data.integrator.use_ambient_occlusion || (sd.flag & SD_AO)) { - kernel_path_ao(kg, &sd, &emission_sd, &L, &state, rng, throughput, shader_bsdf_alpha(kg, &sd)); + kernel_path_ao(kg, &sd, emission_sd, L, state, throughput, shader_bsdf_alpha(kg, &sd)); } #endif /* __AO__ */ @@ -838,11 +635,10 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg, if(sd.flag & SD_BSSRDF) { if(kernel_path_subsurface_scatter(kg, &sd, - &emission_sd, - &L, - &state, - rng, - &ray, + emission_sd, + L, + state, + ray, &throughput, &ss_indirect)) { @@ -852,25 +648,23 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg, #endif /* __SUBSURFACE__ */ /* direct lighting */ - kernel_path_surface_connect_light(kg, rng, &sd, &emission_sd, throughput, &state, &L); + kernel_path_surface_connect_light(kg, &sd, emission_sd, throughput, state, L); /* compute direct lighting and next bounce */ - if(!kernel_path_surface_bounce(kg, rng, &sd, &throughput, &state, &L, &ray)) + if(!kernel_path_surface_bounce(kg, &sd, &throughput, state, &L->state, ray)) break; } #ifdef __SUBSURFACE__ - kernel_path_subsurface_accum_indirect(&ss_indirect, &L); - /* Trace indirect subsurface rays by restarting the loop. this uses less * stack memory than invoking kernel_path_indirect. 
*/ if(ss_indirect.num_rays) { kernel_path_subsurface_setup_indirect(kg, &ss_indirect, - &state, - &ray, - &L, + state, + ray, + L, &throughput); } else { @@ -878,48 +672,51 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg, } } #endif /* __SUBSURFACE__ */ - - float3 L_sum = path_radiance_clamp_and_sum(kg, &L); - - kernel_write_light_passes(kg, buffer, &L, sample); - -#ifdef __KERNEL_DEBUG__ - kernel_write_debug_passes(kg, buffer, &state, &debug_data, sample); -#endif /* __KERNEL_DEBUG__ */ - - return make_float4(L_sum.x, L_sum.y, L_sum.z, 1.0f - L_transparent); } ccl_device void kernel_path_trace(KernelGlobals *kg, - ccl_global float *buffer, ccl_global uint *rng_state, + ccl_global float *buffer, int sample, int x, int y, int offset, int stride) { /* buffer offset */ int index = offset + x + y*stride; int pass_stride = kernel_data.film.pass_stride; - rng_state += index; buffer += index*pass_stride; - /* initialize random numbers and ray */ - RNG rng; + /* Initialize random numbers and sample ray. */ + uint rng_hash; Ray ray; - kernel_path_trace_setup(kg, rng_state, sample, x, y, &rng, &ray); + kernel_path_trace_setup(kg, sample, x, y, &rng_hash, &ray); - /* integrate */ - float4 L; + if(ray.t == 0.0f) { + return; + } - if(ray.t != 0.0f) - L = kernel_path_integrate(kg, &rng, sample, ray, buffer); - else - L = make_float4(0.0f, 0.0f, 0.0f, 0.0f); + /* Initialize state. */ + float3 throughput = make_float3(1.0f, 1.0f, 1.0f); - /* accumulate result in output buffer */ - kernel_write_pass_float4(buffer, sample, L); + PathRadiance L; + path_radiance_init(&L, kernel_data.film.use_light_pass); - path_rng_end(kg, rng_state, rng); + ShaderData emission_sd; + PathState state; + path_state_init(kg, &emission_sd, &state, rng_hash, sample, &ray); + + /* Integrate. 
*/ + kernel_path_integrate(kg, + &state, + throughput, + &ray, + &L, + buffer, + &emission_sd); + + kernel_write_result(kg, buffer, sample, &L); } +#endif /* __SPLIT_KERNEL__ */ + CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernel_path_branched.h b/intern/cycles/kernel/kernel_path_branched.h index c84727ace99..42df7e85b41 100644 --- a/intern/cycles/kernel/kernel_path_branched.h +++ b/intern/cycles/kernel/kernel_path_branched.h @@ -22,8 +22,7 @@ ccl_device_inline void kernel_branched_path_ao(KernelGlobals *kg, ShaderData *sd, ShaderData *emission_sd, PathRadiance *L, - PathState *state, - RNG *rng, + ccl_addr_space PathState *state, float3 throughput) { int num_samples = kernel_data.integrator.ao_samples; @@ -35,46 +34,225 @@ ccl_device_inline void kernel_branched_path_ao(KernelGlobals *kg, for(int j = 0; j < num_samples; j++) { float bsdf_u, bsdf_v; - path_branched_rng_2D(kg, rng, state, j, num_samples, PRNG_BSDF_U, &bsdf_u, &bsdf_v); + path_branched_rng_2D(kg, state->rng_hash, state, j, num_samples, PRNG_BSDF_U, &bsdf_u, &bsdf_v); float3 ao_D; float ao_pdf; sample_cos_hemisphere(ao_N, bsdf_u, bsdf_v, &ao_D, &ao_pdf); - if(dot(ccl_fetch(sd, Ng), ao_D) > 0.0f && ao_pdf != 0.0f) { + if(dot(sd->Ng, ao_D) > 0.0f && ao_pdf != 0.0f) { Ray light_ray; float3 ao_shadow; - light_ray.P = ray_offset(ccl_fetch(sd, P), ccl_fetch(sd, Ng)); + light_ray.P = ray_offset(sd->P, sd->Ng); light_ray.D = ao_D; light_ray.t = kernel_data.background.ao_distance; -#ifdef __OBJECT_MOTION__ - light_ray.time = ccl_fetch(sd, time); -#endif /* __OBJECT_MOTION__ */ - light_ray.dP = ccl_fetch(sd, dP); + light_ray.time = sd->time; + light_ray.dP = sd->dP; light_ray.dD = differential3_zero(); - if(!shadow_blocked(kg, emission_sd, state, &light_ray, &ao_shadow)) - path_radiance_accum_ao(L, throughput*num_samples_inv, ao_alpha, ao_bsdf, ao_shadow, state->bounce); + if(!shadow_blocked(kg, sd, emission_sd, state, &light_ray, &ao_shadow)) { + path_radiance_accum_ao(L, state, 
throughput*num_samples_inv, ao_alpha, ao_bsdf, ao_shadow); + } + else { + path_radiance_accum_total_ao(L, state, throughput*num_samples_inv, ao_bsdf); + } } } } +#ifndef __SPLIT_KERNEL__ + +#ifdef __VOLUME__ +ccl_device_forceinline void kernel_branched_path_volume( + KernelGlobals *kg, + ShaderData *sd, + PathState *state, + Ray *ray, + float3 *throughput, + ccl_addr_space Intersection *isect, + bool hit, + ShaderData *indirect_sd, + ShaderData *emission_sd, + PathRadiance *L) +{ + /* Sanitize volume stack. */ + if(!hit) { + kernel_volume_clean_stack(kg, state->volume_stack); + } + + if(state->volume_stack[0].shader == SHADER_NONE) { + return; + } + + /* volume attenuation, emission, scatter */ + Ray volume_ray = *ray; + volume_ray.t = (hit)? isect->t: FLT_MAX; + + bool heterogeneous = volume_stack_is_heterogeneous(kg, state->volume_stack); + +# ifdef __VOLUME_DECOUPLED__ + /* decoupled ray marching only supported on CPU */ + if(kernel_data.integrator.volume_decoupled) { + /* cache steps along volume for repeated sampling */ + VolumeSegment volume_segment; + + shader_setup_from_volume(kg, sd, &volume_ray); + kernel_volume_decoupled_record(kg, state, + &volume_ray, sd, &volume_segment, heterogeneous); + + /* direct light sampling */ + if(volume_segment.closure_flag & SD_SCATTER) { + volume_segment.sampling_method = volume_stack_sampling_method(kg, state->volume_stack); + + int all = kernel_data.integrator.sample_all_lights_direct; + + kernel_branched_path_volume_connect_light(kg, sd, + emission_sd, *throughput, state, L, all, + &volume_ray, &volume_segment); + + /* indirect light sampling */ + int num_samples = kernel_data.integrator.volume_samples; + float num_samples_inv = 1.0f/num_samples; + + for(int j = 0; j < num_samples; j++) { + PathState ps = *state; + Ray pray = *ray; + float3 tp = *throughput; + + /* branch RNG state */ + path_state_branch(&ps, j, num_samples); + + /* scatter sample. 
if we use distance sampling and take just one + * sample for direct and indirect light, we could share this + * computation, but makes code a bit complex */ + float rphase = path_state_rng_1D(kg, &ps, PRNG_PHASE_CHANNEL); + float rscatter = path_state_rng_1D(kg, &ps, PRNG_SCATTER_DISTANCE); + + VolumeIntegrateResult result = kernel_volume_decoupled_scatter(kg, + &ps, &pray, sd, &tp, rphase, rscatter, &volume_segment, NULL, false); + + if(result == VOLUME_PATH_SCATTERED && + kernel_path_volume_bounce(kg, + sd, + &tp, + &ps, + &L->state, + &pray)) + { + kernel_path_indirect(kg, + indirect_sd, + emission_sd, + &pray, + tp*num_samples_inv, + &ps, + L); + + /* for render passes, sum and reset indirect light pass variables + * for the next samples */ + path_radiance_sum_indirect(L); + path_radiance_reset_indirect(L); + } + } + } + + /* emission and transmittance */ + if(volume_segment.closure_flag & SD_EMISSION) + path_radiance_accum_emission(L, state, *throughput, volume_segment.accum_emission); + *throughput *= volume_segment.accum_transmittance; + + /* free cached steps */ + kernel_volume_decoupled_free(kg, &volume_segment); + } + else +# endif /* __VOLUME_DECOUPLED__ */ + { + /* GPU: no decoupled ray marching, scatter probalistically */ + int num_samples = kernel_data.integrator.volume_samples; + float num_samples_inv = 1.0f/num_samples; + + /* todo: we should cache the shader evaluations from stepping + * through the volume, for now we redo them multiple times */ + + for(int j = 0; j < num_samples; j++) { + PathState ps = *state; + Ray pray = *ray; + float3 tp = (*throughput) * num_samples_inv; + + /* branch RNG state */ + path_state_branch(&ps, j, num_samples); + + VolumeIntegrateResult result = kernel_volume_integrate( + kg, &ps, sd, &volume_ray, L, &tp, heterogeneous); + +# ifdef __VOLUME_SCATTER__ + if(result == VOLUME_PATH_SCATTERED) { + /* todo: support equiangular, MIS and all light sampling. 
+ * alternatively get decoupled ray marching working on the GPU */ + kernel_path_volume_connect_light(kg, sd, emission_sd, tp, state, L); + + if(kernel_path_volume_bounce(kg, + sd, + &tp, + &ps, + &L->state, + &pray)) + { + kernel_path_indirect(kg, + indirect_sd, + emission_sd, + &pray, + tp, + &ps, + L); + + /* for render passes, sum and reset indirect light pass variables + * for the next samples */ + path_radiance_sum_indirect(L); + path_radiance_reset_indirect(L); + } + } +# endif /* __VOLUME_SCATTER__ */ + } + + /* todo: avoid this calculation using decoupled ray marching */ + kernel_volume_shadow(kg, emission_sd, state, &volume_ray, throughput); + } +} +#endif /* __VOLUME__ */ /* bounce off surface and integrate indirect light */ ccl_device_noinline void kernel_branched_path_surface_indirect_light(KernelGlobals *kg, - RNG *rng, ShaderData *sd, ShaderData *indirect_sd, ShaderData *emission_sd, + ShaderData *sd, ShaderData *indirect_sd, ShaderData *emission_sd, float3 throughput, float num_samples_adjust, PathState *state, PathRadiance *L) { - for(int i = 0; i < ccl_fetch(sd, num_closure); i++) { - const ShaderClosure *sc = &ccl_fetch(sd, closure)[i]; + float sum_sample_weight = 0.0f; +#ifdef __DENOISING_FEATURES__ + if(state->denoising_feature_weight > 0.0f) { + for(int i = 0; i < sd->num_closure; i++) { + const ShaderClosure *sc = &sd->closure[i]; + + /* transparency is not handled here, but in outer loop */ + if(!CLOSURE_IS_BSDF(sc->type) || CLOSURE_IS_BSDF_TRANSPARENT(sc->type)) { + continue; + } + + sum_sample_weight += sc->sample_weight; + } + } + else { + sum_sample_weight = 1.0f; + } +#endif /* __DENOISING_FEATURES__ */ + + for(int i = 0; i < sd->num_closure; i++) { + const ShaderClosure *sc = &sd->closure[i]; - if(!CLOSURE_IS_BSDF(sc->type)) - continue; /* transparency is not handled here, but in outer loop */ - if(sc->type == CLOSURE_BSDF_TRANSPARENT_ID) + if(!CLOSURE_IS_BSDF(sc->type) || CLOSURE_IS_BSDF_TRANSPARENT(sc->type)) { continue; + } int 
num_samples; @@ -90,34 +268,38 @@ ccl_device_noinline void kernel_branched_path_surface_indirect_light(KernelGloba num_samples = ceil_to_int(num_samples_adjust*num_samples); float num_samples_inv = num_samples_adjust/num_samples; - RNG bsdf_rng = cmj_hash(*rng, i); for(int j = 0; j < num_samples; j++) { PathState ps = *state; float3 tp = throughput; Ray bsdf_ray; +#ifdef __SHADOW_TRICKS__ + float shadow_transparency = L->shadow_transparency; +#endif + + ps.rng_hash = cmj_hash(state->rng_hash, i); if(!kernel_branched_path_surface_bounce(kg, - &bsdf_rng, sd, sc, j, num_samples, &tp, &ps, - L, - &bsdf_ray)) + &L->state, + &bsdf_ray, + sum_sample_weight)) { continue; } + ps.rng_hash = state->rng_hash; + kernel_path_indirect(kg, indirect_sd, emission_sd, - rng, &bsdf_ray, tp*num_samples_inv, - num_samples, &ps, L); @@ -125,6 +307,10 @@ ccl_device_noinline void kernel_branched_path_surface_indirect_light(KernelGloba * for the next samples */ path_radiance_sum_indirect(L); path_radiance_reset_indirect(L); + +#ifdef __SHADOW_TRICKS__ + L->shadow_transparency = shadow_transparency; +#endif } } } @@ -136,28 +322,27 @@ ccl_device void kernel_branched_path_subsurface_scatter(KernelGlobals *kg, ShaderData *emission_sd, PathRadiance *L, PathState *state, - RNG *rng, Ray *ray, float3 throughput) { - for(int i = 0; i < ccl_fetch(sd, num_closure); i++) { - ShaderClosure *sc = &ccl_fetch(sd, closure)[i]; + for(int i = 0; i < sd->num_closure; i++) { + ShaderClosure *sc = &sd->closure[i]; if(!CLOSURE_IS_BSSRDF(sc->type)) continue; /* set up random number generator */ - uint lcg_state = lcg_state_init(rng, state, 0x68bc21eb); + uint lcg_state = lcg_state_init(state, 0x68bc21eb); int num_samples = kernel_data.integrator.subsurface_samples; float num_samples_inv = 1.0f/num_samples; - RNG bssrdf_rng = cmj_hash(*rng, i); + uint bssrdf_rng_hash = cmj_hash(state->rng_hash, i); /* do subsurface scatter step with copy of shader data, this will * replace the BSSRDF with a diffuse BSDF closure 
*/ for(int j = 0; j < num_samples; j++) { SubsurfaceIntersection ss_isect; float bssrdf_u, bssrdf_v; - path_branched_rng_2D(kg, &bssrdf_rng, state, j, num_samples, PRNG_BSDF_U, &bssrdf_u, &bssrdf_v); + path_branched_rng_2D(kg, bssrdf_rng_hash, state, j, num_samples, PRNG_BSDF_U, &bssrdf_u, &bssrdf_v); int num_hits = subsurface_scatter_multi_intersect(kg, &ss_isect, sd, @@ -167,8 +352,9 @@ ccl_device void kernel_branched_path_subsurface_scatter(KernelGlobals *kg, true); #ifdef __VOLUME__ Ray volume_ray = *ray; - bool need_update_volume_stack = kernel_data.integrator.use_volumes && - ccl_fetch(sd, flag) & SD_OBJECT_INTERSECTS_VOLUME; + bool need_update_volume_stack = + kernel_data.integrator.use_volumes && + sd->object_flag & SD_OBJECT_INTERSECTS_VOLUME; #endif /* __VOLUME__ */ /* compute lighting with the BSDF closure */ @@ -205,10 +391,10 @@ ccl_device void kernel_branched_path_subsurface_scatter(KernelGlobals *kg, #ifdef __EMISSION__ /* direct light */ if(kernel_data.integrator.use_direct_light) { - int all = kernel_data.integrator.sample_all_lights_direct; + int all = (kernel_data.integrator.sample_all_lights_direct) || + (state->flag & PATH_RAY_SHADOW_CATCHER); kernel_branched_path_surface_connect_light( kg, - rng, &bssrdf_sd, emission_sd, &hit_state, @@ -222,7 +408,6 @@ ccl_device void kernel_branched_path_subsurface_scatter(KernelGlobals *kg, /* indirect light */ kernel_branched_path_surface_indirect_light( kg, - rng, &bssrdf_sd, indirect_sd, emission_sd, @@ -236,14 +421,17 @@ ccl_device void kernel_branched_path_subsurface_scatter(KernelGlobals *kg, } #endif /* __SUBSURFACE__ */ -ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, int sample, Ray ray, ccl_global float *buffer) +ccl_device void kernel_branched_path_integrate(KernelGlobals *kg, + uint rng_hash, + int sample, + Ray ray, + ccl_global float *buffer, + PathRadiance *L) { /* initialize */ - PathRadiance L; float3 throughput = make_float3(1.0f, 1.0f, 1.0f); - float 
L_transparent = 0.0f; - path_radiance_init(&L, kernel_data.film.use_light_pass); + path_radiance_init(L, kernel_data.film.use_light_pass); /* shader data memory used for both volumes and surfaces, saves stack space */ ShaderData sd; @@ -251,264 +439,67 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in ShaderData emission_sd, indirect_sd; PathState state; - path_state_init(kg, &emission_sd, &state, rng, sample, &ray); - -#ifdef __KERNEL_DEBUG__ - DebugData debug_data; - debug_data_init(&debug_data); -#endif /* __KERNEL_DEBUG__ */ + path_state_init(kg, &emission_sd, &state, rng_hash, sample, &ray); /* Main Loop * Here we only handle transparency intersections from the camera ray. * Indirect bounces are handled in kernel_branched_path_surface_indirect_light(). */ for(;;) { - /* intersect scene */ + /* Find intersection with objects in scene. */ Intersection isect; - uint visibility = path_state_ray_visibility(kg, &state); - -#ifdef __HAIR__ - float difl = 0.0f, extmax = 0.0f; - uint lcg_state = 0; - - if(kernel_data.bvh.have_curves) { - if(kernel_data.cam.resolution == 1) { - float3 pixdiff = ray.dD.dx + ray.dD.dy; - /*pixdiff = pixdiff - dot(pixdiff, ray.D)*ray.D;*/ - difl = kernel_data.curve.minimum_width * len(pixdiff) * 0.5f; - } - - extmax = kernel_data.curve.maximum_width; - lcg_state = lcg_state_init(rng, &state, 0x51633e2d); - } - - bool hit = scene_intersect(kg, ray, visibility, &isect, &lcg_state, difl, extmax); -#else - bool hit = scene_intersect(kg, ray, visibility, &isect, NULL, 0.0f, 0.0f); -#endif /* __HAIR__ */ - -#ifdef __KERNEL_DEBUG__ - debug_data.num_bvh_traversal_steps += isect.num_traversal_steps; - debug_data.num_bvh_traversed_instances += isect.num_traversed_instances; - debug_data.num_ray_bounces++; -#endif /* __KERNEL_DEBUG__ */ + bool hit = kernel_path_scene_intersect(kg, &state, &ray, &isect, L); #ifdef __VOLUME__ - /* volume attenuation, emission, scatter */ - if(state.volume_stack[0].shader != 
SHADER_NONE) { - Ray volume_ray = ray; - volume_ray.t = (hit)? isect.t: FLT_MAX; - - bool heterogeneous = volume_stack_is_heterogeneous(kg, state.volume_stack); - -#ifdef __VOLUME_DECOUPLED__ - /* decoupled ray marching only supported on CPU */ - - /* cache steps along volume for repeated sampling */ - VolumeSegment volume_segment; - - shader_setup_from_volume(kg, &sd, &volume_ray); - kernel_volume_decoupled_record(kg, &state, - &volume_ray, &sd, &volume_segment, heterogeneous); - - /* direct light sampling */ - if(volume_segment.closure_flag & SD_SCATTER) { - volume_segment.sampling_method = volume_stack_sampling_method(kg, state.volume_stack); - - int all = kernel_data.integrator.sample_all_lights_direct; - - kernel_branched_path_volume_connect_light(kg, rng, &sd, - &emission_sd, throughput, &state, &L, all, - &volume_ray, &volume_segment); - - /* indirect light sampling */ - int num_samples = kernel_data.integrator.volume_samples; - float num_samples_inv = 1.0f/num_samples; - - for(int j = 0; j < num_samples; j++) { - /* workaround to fix correlation bug in T38710, can find better solution - * in random number generator later, for now this is done here to not impact - * performance of rendering without volumes */ - RNG tmp_rng = cmj_hash(*rng, state.rng_offset); - - PathState ps = state; - Ray pray = ray; - float3 tp = throughput; - - /* branch RNG state */ - path_state_branch(&ps, j, num_samples); - - /* scatter sample. 
if we use distance sampling and take just one - * sample for direct and indirect light, we could share this - * computation, but makes code a bit complex */ - float rphase = path_state_rng_1D_for_decision(kg, &tmp_rng, &ps, PRNG_PHASE); - float rscatter = path_state_rng_1D_for_decision(kg, &tmp_rng, &ps, PRNG_SCATTER_DISTANCE); - - VolumeIntegrateResult result = kernel_volume_decoupled_scatter(kg, - &ps, &pray, &sd, &tp, rphase, rscatter, &volume_segment, NULL, false); - - (void)result; - kernel_assert(result == VOLUME_PATH_SCATTERED); - - if(kernel_path_volume_bounce(kg, - rng, - &sd, - &tp, - &ps, - &L, - &pray)) - { - kernel_path_indirect(kg, - &indirect_sd, - &emission_sd, - rng, - &pray, - tp*num_samples_inv, - num_samples, - &ps, - &L); - - /* for render passes, sum and reset indirect light pass variables - * for the next samples */ - path_radiance_sum_indirect(&L); - path_radiance_reset_indirect(&L); - } - } - } - - /* emission and transmittance */ - if(volume_segment.closure_flag & SD_EMISSION) - path_radiance_accum_emission(&L, throughput, volume_segment.accum_emission, state.bounce); - throughput *= volume_segment.accum_transmittance; - - /* free cached steps */ - kernel_volume_decoupled_free(kg, &volume_segment); -#else - /* GPU: no decoupled ray marching, scatter probalistically */ - int num_samples = kernel_data.integrator.volume_samples; - float num_samples_inv = 1.0f/num_samples; - - /* todo: we should cache the shader evaluations from stepping - * through the volume, for now we redo them multiple times */ - - for(int j = 0; j < num_samples; j++) { - PathState ps = state; - Ray pray = ray; - float3 tp = throughput * num_samples_inv; - - /* branch RNG state */ - path_state_branch(&ps, j, num_samples); - - VolumeIntegrateResult result = kernel_volume_integrate( - kg, &ps, &sd, &volume_ray, &L, &tp, rng, heterogeneous); - -#ifdef __VOLUME_SCATTER__ - if(result == VOLUME_PATH_SCATTERED) { - /* todo: support equiangular, MIS and all light sampling. 
- * alternatively get decoupled ray marching working on the GPU */ - kernel_path_volume_connect_light(kg, rng, &sd, &emission_sd, tp, &state, &L); - - if(kernel_path_volume_bounce(kg, - rng, - &sd, - &tp, - &ps, - &L, - &pray)) - { - kernel_path_indirect(kg, - &indirect_sd, - &emission_sd, - rng, - &pray, - tp, - num_samples, - &ps, - &L); - - /* for render passes, sum and reset indirect light pass variables - * for the next samples */ - path_radiance_sum_indirect(&L); - path_radiance_reset_indirect(&L); - } - } -#endif /* __VOLUME_SCATTER__ */ - } - - /* todo: avoid this calculation using decoupled ray marching */ - kernel_volume_shadow(kg, &emission_sd, &state, &volume_ray, &throughput); -#endif /* __VOLUME_DECOUPLED__ */ - } + /* Volume integration. */ + kernel_branched_path_volume(kg, + &sd, + &state, + &ray, + &throughput, + &isect, + hit, + &indirect_sd, + &emission_sd, + L); #endif /* __VOLUME__ */ + /* Shade background. */ if(!hit) { - /* eval background shader if nothing hit */ - if(kernel_data.background.transparent) { - L_transparent += average(throughput); - -#ifdef __PASSES__ - if(!(kernel_data.film.pass_flag & PASS_BACKGROUND)) -#endif /* __PASSES__ */ - break; - } - -#ifdef __BACKGROUND__ - /* sample background shader */ - float3 L_background = indirect_background(kg, &emission_sd, &state, &ray); - path_radiance_accum_background(&L, throughput, L_background, state.bounce); -#endif /* __BACKGROUND__ */ - + kernel_path_background(kg, &state, &ray, throughput, &emission_sd, L); break; } - /* setup shading */ + /* Setup and evaluate shader. 
*/ shader_setup_from_ray(kg, &sd, &isect, &ray); - shader_eval_surface(kg, &sd, rng, &state, 0.0f, state.flag, SHADER_CONTEXT_MAIN); + shader_eval_surface(kg, &sd, &state, state.flag); shader_merge_closures(&sd); - /* holdout */ -#ifdef __HOLDOUT__ - if(sd.flag & (SD_HOLDOUT|SD_HOLDOUT_MASK)) { - if(kernel_data.background.transparent) { - float3 holdout_weight; - - if(sd.flag & SD_HOLDOUT_MASK) - holdout_weight = make_float3(1.0f, 1.0f, 1.0f); - else - holdout_weight = shader_holdout_eval(kg, &sd); - - /* any throughput is ok, should all be identical here */ - L_transparent += average(holdout_weight*throughput); - } - - if(sd.flag & SD_HOLDOUT_MASK) - break; - } -#endif /* __HOLDOUT__ */ - - /* holdout mask objects do not write data passes */ - kernel_write_data_passes(kg, buffer, &L, &sd, sample, &state, throughput); - -#ifdef __EMISSION__ - /* emission */ - if(sd.flag & SD_EMISSION) { - float3 emission = indirect_primitive_emission(kg, &sd, isect.t, state.flag, state.ray_pdf); - path_radiance_accum_emission(&L, throughput, emission, state.bounce); + /* Apply shadow catcher, holdout, emission. */ + if(!kernel_path_shader_apply(kg, + &sd, + &state, + &ray, + throughput, + &emission_sd, + L, + buffer)) + { + break; } -#endif /* __EMISSION__ */ /* transparency termination */ if(state.flag & PATH_RAY_TRANSPARENT) { /* path termination. this is a strange place to put the termination, it's * mainly due to the mixed in MIS that we use. 
gives too many unneeded * shader evaluations, only need emission if we are going to terminate */ - float probability = path_state_terminate_probability(kg, &state, throughput); + float probability = path_state_continuation_probability(kg, &state, throughput); if(probability == 0.0f) { break; } else if(probability != 1.0f) { - float terminate = path_state_rng_1D_for_decision(kg, rng, &state, PRNG_TERMINATE); + float terminate = path_state_rng_1D(kg, &state, PRNG_TERMINATE); if(terminate >= probability) break; @@ -517,10 +508,12 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in } } + kernel_update_denoising_features(kg, &sd, &state, L); + #ifdef __AO__ /* ambient occlusion */ if(kernel_data.integrator.use_ambient_occlusion || (sd.flag & SD_AO)) { - kernel_branched_path_ao(kg, &sd, &emission_sd, &L, &state, rng, throughput); + kernel_branched_path_ao(kg, &sd, &emission_sd, L, &state, throughput); } #endif /* __AO__ */ @@ -528,7 +521,7 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in /* bssrdf scatter to a different location on the same object */ if(sd.flag & SD_BSSRDF) { kernel_branched_path_subsurface_scatter(kg, &sd, &indirect_sd, &emission_sd, - &L, &state, rng, &ray, throughput); + L, &state, &ray, throughput); } #endif /* __SUBSURFACE__ */ @@ -538,15 +531,16 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in #ifdef __EMISSION__ /* direct light */ if(kernel_data.integrator.use_direct_light) { - int all = kernel_data.integrator.sample_all_lights_direct; - kernel_branched_path_surface_connect_light(kg, rng, - &sd, &emission_sd, &hit_state, throughput, 1.0f, &L, all); + int all = (kernel_data.integrator.sample_all_lights_direct) || + (state.flag & PATH_RAY_SHADOW_CATCHER); + kernel_branched_path_surface_connect_light(kg, + &sd, &emission_sd, &hit_state, throughput, 1.0f, L, all); } #endif /* __EMISSION__ */ /* indirect light */ - 
kernel_branched_path_surface_indirect_light(kg, rng, - &sd, &indirect_sd, &emission_sd, throughput, 1.0f, &hit_state, &L); + kernel_branched_path_surface_indirect_light(kg, + &sd, &indirect_sd, &emission_sd, throughput, 1.0f, &hit_state, L); /* continue in case of transparency */ throughput *= shader_bsdf_transparency(kg, &sd); @@ -574,50 +568,35 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in kernel_volume_stack_enter_exit(kg, &sd, state.volume_stack); #endif /* __VOLUME__ */ } - - float3 L_sum = path_radiance_clamp_and_sum(kg, &L); - - kernel_write_light_passes(kg, buffer, &L, sample); - -#ifdef __KERNEL_DEBUG__ - kernel_write_debug_passes(kg, buffer, &state, &debug_data, sample); -#endif /* __KERNEL_DEBUG__ */ - - return make_float4(L_sum.x, L_sum.y, L_sum.z, 1.0f - L_transparent); } ccl_device void kernel_branched_path_trace(KernelGlobals *kg, - ccl_global float *buffer, ccl_global uint *rng_state, + ccl_global float *buffer, int sample, int x, int y, int offset, int stride) { /* buffer offset */ int index = offset + x + y*stride; int pass_stride = kernel_data.film.pass_stride; - rng_state += index; buffer += index*pass_stride; /* initialize random numbers and ray */ - RNG rng; + uint rng_hash; Ray ray; - kernel_path_trace_setup(kg, rng_state, sample, x, y, &rng, &ray); + kernel_path_trace_setup(kg, sample, x, y, &rng_hash, &ray); /* integrate */ - float4 L; - - if(ray.t != 0.0f) - L = kernel_branched_path_integrate(kg, &rng, sample, ray, buffer); - else - L = make_float4(0.0f, 0.0f, 0.0f, 0.0f); - - /* accumulate result in output buffer */ - kernel_write_pass_float4(buffer, sample, L); + PathRadiance L; - path_rng_end(kg, rng_state, rng); + if(ray.t != 0.0f) { + kernel_branched_path_integrate(kg, rng_hash, sample, ray, buffer, &L); + kernel_write_result(kg, buffer, sample, &L); + } } +#endif /* __SPLIT_KERNEL__ */ + #endif /* __BRANCHED_PATH__ */ CCL_NAMESPACE_END - diff --git a/intern/cycles/kernel/kernel_path_common.h 
b/intern/cycles/kernel/kernel_path_common.h index 13597eab287..d83fd474cde 100644 --- a/intern/cycles/kernel/kernel_path_common.h +++ b/intern/cycles/kernel/kernel_path_common.h @@ -14,15 +14,14 @@ * limitations under the License. */ -#include "util_hash.h" +#include "util/util_hash.h" CCL_NAMESPACE_BEGIN ccl_device_inline void kernel_path_trace_setup(KernelGlobals *kg, - ccl_global uint *rng_state, int sample, int x, int y, - ccl_addr_space RNG *rng, + uint *rng_hash, ccl_addr_space Ray *ray) { float filter_u; @@ -30,24 +29,20 @@ ccl_device_inline void kernel_path_trace_setup(KernelGlobals *kg, int num_samples = kernel_data.integrator.aa_samples; - if(sample == 0) { - *rng_state = hash_int_2d(x, y); - } - - path_rng_init(kg, rng_state, sample, num_samples, rng, x, y, &filter_u, &filter_v); + path_rng_init(kg, sample, num_samples, rng_hash, x, y, &filter_u, &filter_v); /* sample camera ray */ float lens_u = 0.0f, lens_v = 0.0f; if(kernel_data.cam.aperturesize > 0.0f) - path_rng_2D(kg, rng, sample, num_samples, PRNG_LENS_U, &lens_u, &lens_v); + path_rng_2D(kg, *rng_hash, sample, num_samples, PRNG_LENS_U, &lens_u, &lens_v); float time = 0.0f; #ifdef __CAMERA_MOTION__ if(kernel_data.cam.shuttertime != -1.0f) - time = path_rng_1D(kg, rng, sample, num_samples, PRNG_TIME); + time = path_rng_1D(kg, *rng_hash, sample, num_samples, PRNG_TIME); #endif camera_sample(kg, x, y, filter_u, filter_v, lens_u, lens_v, time, ray); diff --git a/intern/cycles/kernel/kernel_path_state.h b/intern/cycles/kernel/kernel_path_state.h index 661dc52fb31..eccee54c0e3 100644 --- a/intern/cycles/kernel/kernel_path_state.h +++ b/intern/cycles/kernel/kernel_path_state.h @@ -19,15 +19,17 @@ CCL_NAMESPACE_BEGIN ccl_device_inline void path_state_init(KernelGlobals *kg, ShaderData *stack_sd, ccl_addr_space PathState *state, - ccl_addr_space RNG *rng, + uint rng_hash, int sample, ccl_addr_space Ray *ray) { state->flag = PATH_RAY_CAMERA|PATH_RAY_MIS_SKIP; + state->rng_hash = rng_hash; state->rng_offset = 
PRNG_BASE_NUM; state->sample = sample; state->num_samples = kernel_data.integrator.aa_samples; + state->branch_factor = 1.0f; state->bounce = 0; state->diffuse_bounce = 0; @@ -35,6 +37,16 @@ ccl_device_inline void path_state_init(KernelGlobals *kg, state->transmission_bounce = 0; state->transparent_bounce = 0; +#ifdef __DENOISING_FEATURES__ + if(kernel_data.film.pass_denoising_data) { + state->flag |= PATH_RAY_STORE_SHADOW_INFO; + state->denoising_feature_weight = 1.0f; + } + else { + state->denoising_feature_weight = 0.0f; + } +#endif /* __DENOISING_FEATURES__ */ + state->min_ray_pdf = FLT_MAX; state->ray_pdf = 0.0f; #ifdef __LAMP_MIS__ @@ -48,7 +60,7 @@ ccl_device_inline void path_state_init(KernelGlobals *kg, /* Initialize volume stack with volume we are inside of. */ kernel_volume_stack_init(kg, stack_sd, state, ray, state->volume_stack); /* Seed RNG for cases where we can't use stratified samples .*/ - state->rng_congruential = lcg_init(*rng + sample*0x51633e2d); + state->rng_congruential = lcg_init(rng_hash + sample*0x51633e2d); } else { state->volume_stack[0].shader = SHADER_NONE; @@ -64,12 +76,12 @@ ccl_device_inline void path_state_next(KernelGlobals *kg, ccl_addr_space PathSta state->flag |= PATH_RAY_TRANSPARENT; state->transparent_bounce++; - /* don't increase random number generator offset here, to avoid some - * unwanted patterns, see path_state_rng_1D_for_decision */ - if(!kernel_data.integrator.transparent_shadows) state->flag |= PATH_RAY_MIS_SKIP; + /* random number generator next bounce */ + state->rng_offset += PRNG_BOUNCE_NUM; + return; } @@ -124,9 +136,15 @@ ccl_device_inline void path_state_next(KernelGlobals *kg, ccl_addr_space PathSta /* random number generator next bounce */ state->rng_offset += PRNG_BOUNCE_NUM; + +#ifdef __DENOISING_FEATURES__ + if((state->denoising_feature_weight == 0.0f) && !(state->flag & PATH_RAY_SHADOW_CATCHER)) { + state->flag &= ~PATH_RAY_STORE_SHADOW_INFO; + } +#endif } -ccl_device_inline uint 
path_state_ray_visibility(KernelGlobals *kg, PathState *state) +ccl_device_inline uint path_state_ray_visibility(KernelGlobals *kg, ccl_addr_space PathState *state) { uint flag = state->flag & PATH_RAY_ALL_VISIBILITY; @@ -140,17 +158,28 @@ ccl_device_inline uint path_state_ray_visibility(KernelGlobals *kg, PathState *s return flag; } -ccl_device_inline float path_state_terminate_probability(KernelGlobals *kg, ccl_addr_space PathState *state, const float3 throughput) +ccl_device_inline float path_state_continuation_probability(KernelGlobals *kg, + ccl_addr_space PathState *state, + const float3 throughput) { if(state->flag & PATH_RAY_TRANSPARENT) { - /* transparent rays treated separately */ - if(state->transparent_bounce >= kernel_data.integrator.transparent_max_bounce) + /* Transparent rays are treated separately with own max bounces. */ + if(state->transparent_bounce >= kernel_data.integrator.transparent_max_bounce) { return 0.0f; - else if(state->transparent_bounce <= kernel_data.integrator.transparent_min_bounce) + } + /* Do at least one bounce without RR. */ + else if(state->transparent_bounce <= 1) { return 1.0f; + } +#ifdef __SHADOW_TRICKS__ + /* Exception for shadow catcher not working correctly with RR. */ + else if((state->flag & PATH_RAY_SHADOW_CATCHER) && (state->transparent_bounce <= 8)) { + return 1.0f; + } +#endif } else { - /* other rays */ + /* Test max bounces for various ray types. */ if((state->bounce >= kernel_data.integrator.max_bounce) || (state->diffuse_bounce >= kernel_data.integrator.max_diffuse_bounce) || (state->glossy_bounce >= kernel_data.integrator.max_glossy_bounce) || @@ -161,13 +190,21 @@ ccl_device_inline float path_state_terminate_probability(KernelGlobals *kg, ccl_ { return 0.0f; } - else if(state->bounce <= kernel_data.integrator.min_bounce) { + /* Do at least one bounce without RR. */ + else if(state->bounce <= 1) { return 1.0f; } +#ifdef __SHADOW_TRICKS__ + /* Exception for shadow catcher not working correctly with RR. 
*/ + else if((state->flag & PATH_RAY_SHADOW_CATCHER) && (state->bounce <= 3)) { + return 1.0f; + } +#endif } - /* probalistic termination */ - return average(throughput); /* todo: try using max here */ + /* Probalistic termination: use sqrt() to roughly match typical view + * transform and do path termination a bit later on average. */ + return min(sqrtf(max3(fabs(throughput)) * state->branch_factor), 1.0f); } /* TODO(DingTo): Find more meaningful name for this */ @@ -180,5 +217,30 @@ ccl_device_inline void path_state_modify_bounce(ccl_addr_space PathState *state, state->bounce -= 1; } +ccl_device_inline bool path_state_ao_bounce(KernelGlobals *kg, ccl_addr_space PathState *state) +{ + if(state->bounce <= kernel_data.integrator.ao_bounces) { + return false; + } + + int bounce = state->bounce - state->transmission_bounce - (state->glossy_bounce > 0); + return (bounce > kernel_data.integrator.ao_bounces); +} + +ccl_device_inline void path_state_branch(ccl_addr_space PathState *state, + int branch, + int num_branches) +{ + state->rng_offset += PRNG_BOUNCE_NUM; + + if(num_branches > 1) { + /* Path is splitting into a branch, adjust so that each branch + * still gets a unique sample from the same sequence. */ + state->sample = state->sample*num_branches + branch; + state->num_samples = state->num_samples*num_branches; + state->branch_factor *= num_branches; + } +} + CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernel_path_subsurface.h b/intern/cycles/kernel/kernel_path_subsurface.h new file mode 100644 index 00000000000..1436e8e5a5b --- /dev/null +++ b/intern/cycles/kernel/kernel_path_subsurface.h @@ -0,0 +1,156 @@ +/* + * Copyright 2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +CCL_NAMESPACE_BEGIN + +#ifdef __SUBSURFACE__ +# ifndef __KERNEL_CUDA__ +ccl_device +# else +ccl_device_inline +# endif +bool kernel_path_subsurface_scatter( + KernelGlobals *kg, + ShaderData *sd, + ShaderData *emission_sd, + PathRadiance *L, + ccl_addr_space PathState *state, + ccl_addr_space Ray *ray, + ccl_addr_space float3 *throughput, + ccl_addr_space SubsurfaceIndirectRays *ss_indirect) +{ + float bssrdf_u, bssrdf_v; + path_state_rng_2D(kg, state, PRNG_BSDF_U, &bssrdf_u, &bssrdf_v); + + const ShaderClosure *sc = shader_bssrdf_pick(sd, throughput, &bssrdf_u); + + /* do bssrdf scatter step if we picked a bssrdf closure */ + if(sc) { + /* We should never have two consecutive BSSRDF bounces, + * the second one should be converted to a diffuse BSDF to + * avoid this. + */ + kernel_assert(!(state->flag & PATH_RAY_DIFFUSE_ANCESTOR)); + + uint lcg_state = lcg_state_init_addrspace(state, 0x68bc21eb); + + SubsurfaceIntersection ss_isect; + int num_hits = subsurface_scatter_multi_intersect(kg, + &ss_isect, + sd, + sc, + &lcg_state, + bssrdf_u, bssrdf_v, + false); +# ifdef __VOLUME__ + bool need_update_volume_stack = + kernel_data.integrator.use_volumes && + sd->object_flag & SD_OBJECT_INTERSECTS_VOLUME; +# endif /* __VOLUME__ */ + + /* compute lighting with the BSDF closure */ + for(int hit = 0; hit < num_hits; hit++) { + /* NOTE: We reuse the existing ShaderData, we assume the path + * integration loop stops when this function returns true. 
+ */ + subsurface_scatter_multi_setup(kg, + &ss_isect, + hit, + sd, + state, + state->flag, + sc, + false); + + kernel_path_surface_connect_light(kg, sd, emission_sd, *throughput, state, L); + + ccl_addr_space PathState *hit_state = &ss_indirect->state[ss_indirect->num_rays]; + ccl_addr_space Ray *hit_ray = &ss_indirect->rays[ss_indirect->num_rays]; + ccl_addr_space float3 *hit_tp = &ss_indirect->throughputs[ss_indirect->num_rays]; + PathRadianceState *hit_L_state = &ss_indirect->L_state[ss_indirect->num_rays]; + + *hit_state = *state; + *hit_ray = *ray; + *hit_tp = *throughput; + *hit_L_state = L->state; + + hit_state->rng_offset += PRNG_BOUNCE_NUM; + + if(kernel_path_surface_bounce(kg, + sd, + hit_tp, + hit_state, + hit_L_state, + hit_ray)) + { +# ifdef __LAMP_MIS__ + hit_state->ray_t = 0.0f; +# endif /* __LAMP_MIS__ */ + +# ifdef __VOLUME__ + if(need_update_volume_stack) { + Ray volume_ray = *ray; + /* Setup ray from previous surface point to the new one. */ + volume_ray.D = normalize_len(hit_ray->P - volume_ray.P, + &volume_ray.t); + + kernel_volume_stack_update_for_subsurface( + kg, + emission_sd, + &volume_ray, + hit_state->volume_stack); + } +# endif /* __VOLUME__ */ + ss_indirect->num_rays++; + } + } + return true; + } + return false; +} + +ccl_device_inline void kernel_path_subsurface_init_indirect( + ccl_addr_space SubsurfaceIndirectRays *ss_indirect) +{ + ss_indirect->num_rays = 0; +} + +ccl_device void kernel_path_subsurface_setup_indirect( + KernelGlobals *kg, + ccl_addr_space SubsurfaceIndirectRays *ss_indirect, + ccl_addr_space PathState *state, + ccl_addr_space Ray *ray, + PathRadiance *L, + ccl_addr_space float3 *throughput) +{ + /* Setup state, ray and throughput for indirect SSS rays. 
*/ + ss_indirect->num_rays--; + + path_radiance_sum_indirect(L); + path_radiance_reset_indirect(L); + + *state = ss_indirect->state[ss_indirect->num_rays]; + *ray = ss_indirect->rays[ss_indirect->num_rays]; + L->state = ss_indirect->L_state[ss_indirect->num_rays]; + *throughput = ss_indirect->throughputs[ss_indirect->num_rays]; + + state->rng_offset += ss_indirect->num_rays * PRNG_BOUNCE_NUM; +} + +#endif /* __SUBSURFACE__ */ + +CCL_NAMESPACE_END + diff --git a/intern/cycles/kernel/kernel_path_surface.h b/intern/cycles/kernel/kernel_path_surface.h index fea503d06e5..7b566b01b04 100644 --- a/intern/cycles/kernel/kernel_path_surface.h +++ b/intern/cycles/kernel/kernel_path_surface.h @@ -16,16 +16,21 @@ CCL_NAMESPACE_BEGIN -#if defined(__BRANCHED_PATH__) || defined(__SUBSURFACE__) - +#if defined(__BRANCHED_PATH__) || defined(__SUBSURFACE__) || defined(__SHADOW_TRICKS__) || defined(__BAKING__) /* branched path tracing: connect path directly to position on one or more lights and add it to L */ -ccl_device_noinline void kernel_branched_path_surface_connect_light(KernelGlobals *kg, RNG *rng, - ShaderData *sd, ShaderData *emission_sd, PathState *state, float3 throughput, - float num_samples_adjust, PathRadiance *L, int sample_all_lights) +ccl_device_noinline void kernel_branched_path_surface_connect_light( + KernelGlobals *kg, + ShaderData *sd, + ShaderData *emission_sd, + ccl_addr_space PathState *state, + float3 throughput, + float num_samples_adjust, + PathRadiance *L, + int sample_all_lights) { #ifdef __EMISSION__ /* sample illumination from lights to find path contribution */ - if(!(ccl_fetch(sd, flag) & SD_BSDF_HAS_EVAL)) + if(!(sd->flag & SD_BSDF_HAS_EVAL)) return; Ray light_ray; @@ -33,7 +38,7 @@ ccl_device_noinline void kernel_branched_path_surface_connect_light(KernelGlobal bool is_lamp; # ifdef __OBJECT_MOTION__ - light_ray.time = ccl_fetch(sd, time); + light_ray.time = sd->time; # endif if(sample_all_lights) { @@ -44,15 +49,15 @@ ccl_device_noinline void 
kernel_branched_path_surface_connect_light(KernelGlobal int num_samples = ceil_to_int(num_samples_adjust*light_select_num_samples(kg, i)); float num_samples_inv = num_samples_adjust/(num_samples*kernel_data.integrator.num_all_lights); - RNG lamp_rng = cmj_hash(*rng, i); + uint lamp_rng_hash = cmj_hash(state->rng_hash, i); for(int j = 0; j < num_samples; j++) { float light_u, light_v; - path_branched_rng_2D(kg, &lamp_rng, state, j, num_samples, PRNG_LIGHT_U, &light_u, &light_v); - float terminate = path_branched_rng_light_termination(kg, &lamp_rng, state, j, num_samples); + path_branched_rng_2D(kg, lamp_rng_hash, state, j, num_samples, PRNG_LIGHT_U, &light_u, &light_v); + float terminate = path_branched_rng_light_termination(kg, lamp_rng_hash, state, j, num_samples); LightSample ls; - if(lamp_light_sample(kg, i, light_u, light_v, ccl_fetch(sd, P), &ls)) { + if(lamp_light_sample(kg, i, light_u, light_v, sd->P, &ls)) { /* The sampling probability returned by lamp_light_sample assumes that all lights were sampled. * However, this code only samples lamps, so if the scene also had mesh lights, the real probability is twice as high. 
*/ if(kernel_data.integrator.pdf_triangles != 0.0f) @@ -62,9 +67,12 @@ ccl_device_noinline void kernel_branched_path_surface_connect_light(KernelGlobal /* trace shadow ray */ float3 shadow; - if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) { + if(!shadow_blocked(kg, sd, emission_sd, state, &light_ray, &shadow)) { /* accumulate */ - path_radiance_accum_light(L, throughput*num_samples_inv, &L_light, shadow, num_samples_inv, state->bounce, is_lamp); + path_radiance_accum_light(L, state, throughput*num_samples_inv, &L_light, shadow, num_samples_inv, is_lamp); + } + else { + path_radiance_accum_total_light(L, state, throughput*num_samples_inv, &L_light); } } } @@ -77,17 +85,16 @@ ccl_device_noinline void kernel_branched_path_surface_connect_light(KernelGlobal float num_samples_inv = num_samples_adjust/num_samples; for(int j = 0; j < num_samples; j++) { - float light_t = path_branched_rng_1D(kg, rng, state, j, num_samples, PRNG_LIGHT); float light_u, light_v; - path_branched_rng_2D(kg, rng, state, j, num_samples, PRNG_LIGHT_U, &light_u, &light_v); - float terminate = path_branched_rng_light_termination(kg, rng, state, j, num_samples); + path_branched_rng_2D(kg, state->rng_hash, state, j, num_samples, PRNG_LIGHT_U, &light_u, &light_v); + float terminate = path_branched_rng_light_termination(kg, state->rng_hash, state, j, num_samples); /* only sample triangle lights */ if(kernel_data.integrator.num_all_lights) - light_t = 0.5f*light_t; + light_u = 0.5f*light_u; LightSample ls; - if(light_sample(kg, light_t, light_u, light_v, ccl_fetch(sd, time), ccl_fetch(sd, P), state->bounce, &ls)) { + if(light_sample(kg, light_u, light_v, sd->time, sd->P, state->bounce, &ls)) { /* Same as above, probability needs to be corrected since the sampling was forced to select a mesh light. 
*/ if(kernel_data.integrator.num_all_lights) ls.pdf *= 2.0f; @@ -96,9 +103,12 @@ ccl_device_noinline void kernel_branched_path_surface_connect_light(KernelGlobal /* trace shadow ray */ float3 shadow; - if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) { + if(!shadow_blocked(kg, sd, emission_sd, state, &light_ray, &shadow)) { /* accumulate */ - path_radiance_accum_light(L, throughput*num_samples_inv, &L_light, shadow, num_samples_inv, state->bounce, is_lamp); + path_radiance_accum_light(L, state, throughput*num_samples_inv, &L_light, shadow, num_samples_inv, is_lamp); + } + else { + path_radiance_accum_total_light(L, state, throughput*num_samples_inv, &L_light); } } } @@ -107,21 +117,23 @@ ccl_device_noinline void kernel_branched_path_surface_connect_light(KernelGlobal } else { /* sample one light at random */ - float light_t = path_state_rng_1D(kg, rng, state, PRNG_LIGHT); float light_u, light_v; - path_state_rng_2D(kg, rng, state, PRNG_LIGHT_U, &light_u, &light_v); - float terminate = path_state_rng_light_termination(kg, rng, state); + path_state_rng_2D(kg, state, PRNG_LIGHT_U, &light_u, &light_v); + float terminate = path_state_rng_light_termination(kg, state); LightSample ls; - if(light_sample(kg, light_t, light_u, light_v, ccl_fetch(sd, time), ccl_fetch(sd, P), state->bounce, &ls)) { + if(light_sample(kg, light_u, light_v, sd->time, sd->P, state->bounce, &ls)) { /* sample random light */ if(direct_emission(kg, sd, emission_sd, &ls, state, &light_ray, &L_light, &is_lamp, terminate)) { /* trace shadow ray */ float3 shadow; - if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) { + if(!shadow_blocked(kg, sd, emission_sd, state, &light_ray, &shadow)) { /* accumulate */ - path_radiance_accum_light(L, throughput*num_samples_adjust, &L_light, shadow, num_samples_adjust, state->bounce, is_lamp); + path_radiance_accum_light(L, state, throughput*num_samples_adjust, &L_light, shadow, num_samples_adjust, is_lamp); + } + else { + 
path_radiance_accum_total_light(L, state, throughput*num_samples_adjust, &L_light); } } } @@ -130,9 +142,17 @@ ccl_device_noinline void kernel_branched_path_surface_connect_light(KernelGlobal } /* branched path tracing: bounce off or through surface to with new direction stored in ray */ -ccl_device bool kernel_branched_path_surface_bounce(KernelGlobals *kg, RNG *rng, - ShaderData *sd, const ShaderClosure *sc, int sample, int num_samples, - float3 *throughput, PathState *state, PathRadiance *L, Ray *ray) +ccl_device bool kernel_branched_path_surface_bounce( + KernelGlobals *kg, + ShaderData *sd, + const ShaderClosure *sc, + int sample, + int num_samples, + ccl_addr_space float3 *throughput, + ccl_addr_space PathState *state, + PathRadianceState *L_state, + ccl_addr_space Ray *ray, + float sum_sample_weight) { /* sample BSDF */ float bsdf_pdf; @@ -140,7 +160,7 @@ ccl_device bool kernel_branched_path_surface_bounce(KernelGlobals *kg, RNG *rng, float3 bsdf_omega_in; differential3 bsdf_domega_in; float bsdf_u, bsdf_v; - path_branched_rng_2D(kg, rng, state, sample, num_samples, PRNG_BSDF_U, &bsdf_u, &bsdf_v); + path_branched_rng_2D(kg, state->rng_hash, state, sample, num_samples, PRNG_BSDF_U, &bsdf_u, &bsdf_v); int label; label = shader_bsdf_sample_closure(kg, sd, sc, bsdf_u, bsdf_v, &bsdf_eval, @@ -150,21 +170,25 @@ ccl_device bool kernel_branched_path_surface_bounce(KernelGlobals *kg, RNG *rng, return false; /* modify throughput */ - path_radiance_bsdf_bounce(L, throughput, &bsdf_eval, bsdf_pdf, state->bounce, label); + path_radiance_bsdf_bounce(kg, L_state, throughput, &bsdf_eval, bsdf_pdf, state->bounce, label); + +#ifdef __DENOISING_FEATURES__ + state->denoising_feature_weight *= sc->sample_weight / (sum_sample_weight * num_samples); +#endif /* modify path state */ path_state_next(kg, state, label); /* setup ray */ - ray->P = ray_offset(ccl_fetch(sd, P), (label & LABEL_TRANSMIT)? 
-ccl_fetch(sd, Ng): ccl_fetch(sd, Ng)); + ray->P = ray_offset(sd->P, (label & LABEL_TRANSMIT)? -sd->Ng: sd->Ng); ray->D = normalize(bsdf_omega_in); ray->t = FLT_MAX; #ifdef __RAY_DIFFERENTIALS__ - ray->dP = ccl_fetch(sd, dP); + ray->dP = sd->dP; ray->dD = bsdf_domega_in; #endif #ifdef __OBJECT_MOTION__ - ray->time = ccl_fetch(sd, time); + ray->time = sd->time; #endif #ifdef __VOLUME__ @@ -188,64 +212,77 @@ ccl_device bool kernel_branched_path_surface_bounce(KernelGlobals *kg, RNG *rng, #endif -#ifndef __SPLIT_KERNEL__ /* path tracing: connect path directly to position on a light and add it to L */ -ccl_device_inline void kernel_path_surface_connect_light(KernelGlobals *kg, ccl_addr_space RNG *rng, +ccl_device_inline void kernel_path_surface_connect_light(KernelGlobals *kg, ShaderData *sd, ShaderData *emission_sd, float3 throughput, ccl_addr_space PathState *state, PathRadiance *L) { #ifdef __EMISSION__ - if(!(kernel_data.integrator.use_direct_light && (ccl_fetch(sd, flag) & SD_BSDF_HAS_EVAL))) + if(!(kernel_data.integrator.use_direct_light && (sd->flag & SD_BSDF_HAS_EVAL))) return; +#ifdef __SHADOW_TRICKS__ + if(state->flag & PATH_RAY_SHADOW_CATCHER) { + kernel_branched_path_surface_connect_light(kg, + sd, + emission_sd, + state, + throughput, + 1.0f, + L, + 1); + return; + } +#endif + /* sample illumination from lights to find path contribution */ - float light_t = path_state_rng_1D(kg, rng, state, PRNG_LIGHT); float light_u, light_v; - path_state_rng_2D(kg, rng, state, PRNG_LIGHT_U, &light_u, &light_v); + path_state_rng_2D(kg, state, PRNG_LIGHT_U, &light_u, &light_v); Ray light_ray; BsdfEval L_light; bool is_lamp; #ifdef __OBJECT_MOTION__ - light_ray.time = ccl_fetch(sd, time); + light_ray.time = sd->time; #endif LightSample ls; - if(light_sample(kg, light_t, light_u, light_v, ccl_fetch(sd, time), ccl_fetch(sd, P), state->bounce, &ls)) { - float terminate = path_state_rng_light_termination(kg, rng, state); + if(light_sample(kg, light_u, light_v, sd->time, sd->P, 
state->bounce, &ls)) { + float terminate = path_state_rng_light_termination(kg, state); if(direct_emission(kg, sd, emission_sd, &ls, state, &light_ray, &L_light, &is_lamp, terminate)) { /* trace shadow ray */ float3 shadow; - if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) { + if(!shadow_blocked(kg, sd, emission_sd, state, &light_ray, &shadow)) { /* accumulate */ - path_radiance_accum_light(L, throughput, &L_light, shadow, 1.0f, state->bounce, is_lamp); + path_radiance_accum_light(L, state, throughput, &L_light, shadow, 1.0f, is_lamp); + } + else { + path_radiance_accum_total_light(L, state, throughput, &L_light); } } } #endif } -#endif /* path tracing: bounce off or through surface to with new direction stored in ray */ ccl_device bool kernel_path_surface_bounce(KernelGlobals *kg, - ccl_addr_space RNG *rng, ShaderData *sd, ccl_addr_space float3 *throughput, ccl_addr_space PathState *state, - PathRadiance *L, + PathRadianceState *L_state, ccl_addr_space Ray *ray) { /* no BSDF? we can stop here */ - if(ccl_fetch(sd, flag) & SD_BSDF) { + if(sd->flag & SD_BSDF) { /* sample BSDF */ float bsdf_pdf; BsdfEval bsdf_eval; float3 bsdf_omega_in; differential3 bsdf_domega_in; float bsdf_u, bsdf_v; - path_state_rng_2D(kg, rng, state, PRNG_BSDF_U, &bsdf_u, &bsdf_v); + path_state_rng_2D(kg, state, PRNG_BSDF_U, &bsdf_u, &bsdf_v); int label; label = shader_bsdf_sample(kg, sd, bsdf_u, bsdf_v, &bsdf_eval, @@ -255,7 +292,7 @@ ccl_device bool kernel_path_surface_bounce(KernelGlobals *kg, return false; /* modify throughput */ - path_radiance_bsdf_bounce(L, throughput, &bsdf_eval, bsdf_pdf, state->bounce, label); + path_radiance_bsdf_bounce(kg, L_state, throughput, &bsdf_eval, bsdf_pdf, state->bounce, label); /* set labels */ if(!(label & LABEL_TRANSPARENT)) { @@ -270,16 +307,16 @@ ccl_device bool kernel_path_surface_bounce(KernelGlobals *kg, path_state_next(kg, state, label); /* setup ray */ - ray->P = ray_offset(ccl_fetch(sd, P), (label & LABEL_TRANSMIT)? 
-ccl_fetch(sd, Ng): ccl_fetch(sd, Ng)); + ray->P = ray_offset(sd->P, (label & LABEL_TRANSMIT)? -sd->Ng: sd->Ng); ray->D = normalize(bsdf_omega_in); if(state->bounce == 0) - ray->t -= ccl_fetch(sd, ray_length); /* clipping works through transparent */ + ray->t -= sd->ray_length; /* clipping works through transparent */ else ray->t = FLT_MAX; #ifdef __RAY_DIFFERENTIALS__ - ray->dP = ccl_fetch(sd, dP); + ray->dP = sd->dP; ray->dD = bsdf_domega_in; #endif @@ -291,21 +328,21 @@ ccl_device bool kernel_path_surface_bounce(KernelGlobals *kg, return true; } #ifdef __VOLUME__ - else if(ccl_fetch(sd, flag) & SD_HAS_ONLY_VOLUME) { + else if(sd->flag & SD_HAS_ONLY_VOLUME) { /* no surface shader but have a volume shader? act transparent */ /* update path state, count as transparent */ path_state_next(kg, state, LABEL_TRANSPARENT); if(state->bounce == 0) - ray->t -= ccl_fetch(sd, ray_length); /* clipping works through transparent */ + ray->t -= sd->ray_length; /* clipping works through transparent */ else ray->t = FLT_MAX; /* setup ray position, direction stays unchanged */ - ray->P = ray_offset(ccl_fetch(sd, P), -ccl_fetch(sd, Ng)); + ray->P = ray_offset(sd->P, -sd->Ng); #ifdef __RAY_DIFFERENTIALS__ - ray->dP = ccl_fetch(sd, dP); + ray->dP = sd->dP; #endif /* enter/exit volume */ diff --git a/intern/cycles/kernel/kernel_path_volume.h b/intern/cycles/kernel/kernel_path_volume.h index 3d3b7385d8b..b6a856baf24 100644 --- a/intern/cycles/kernel/kernel_path_volume.h +++ b/intern/cycles/kernel/kernel_path_volume.h @@ -20,11 +20,10 @@ CCL_NAMESPACE_BEGIN ccl_device_inline void kernel_path_volume_connect_light( KernelGlobals *kg, - RNG *rng, ShaderData *sd, ShaderData *emission_sd, float3 throughput, - PathState *state, + ccl_addr_space PathState *state, PathRadiance *L) { #ifdef __EMISSION__ @@ -32,9 +31,8 @@ ccl_device_inline void kernel_path_volume_connect_light( return; /* sample illumination from lights to find path contribution */ - float light_t = path_state_rng_1D(kg, rng, 
state, PRNG_LIGHT); float light_u, light_v; - path_state_rng_2D(kg, rng, state, PRNG_LIGHT_U, &light_u, &light_v); + path_state_rng_2D(kg, state, PRNG_LIGHT_U, &light_u, &light_v); Ray light_ray; BsdfEval L_light; @@ -42,24 +40,22 @@ ccl_device_inline void kernel_path_volume_connect_light( bool is_lamp; /* connect to light from given point where shader has been evaluated */ -# ifdef __OBJECT_MOTION__ light_ray.time = sd->time; -# endif - if(light_sample(kg, light_t, light_u, light_v, sd->time, sd->P, state->bounce, &ls)) + if(light_sample(kg, light_u, light_v, sd->time, sd->P, state->bounce, &ls)) { - float terminate = path_state_rng_light_termination(kg, rng, state); + float terminate = path_state_rng_light_termination(kg, state); if(direct_emission(kg, sd, emission_sd, &ls, state, &light_ray, &L_light, &is_lamp, terminate)) { /* trace shadow ray */ float3 shadow; - if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) { + if(!shadow_blocked(kg, sd, emission_sd, state, &light_ray, &shadow)) { /* accumulate */ - path_radiance_accum_light(L, throughput, &L_light, shadow, 1.0f, state->bounce, is_lamp); + path_radiance_accum_light(L, state, throughput, &L_light, shadow, 1.0f, is_lamp); } } } -#endif +#endif /* __EMISSION__ */ } #ifdef __KERNEL_GPU__ @@ -67,8 +63,13 @@ ccl_device_noinline #else ccl_device #endif -bool kernel_path_volume_bounce(KernelGlobals *kg, RNG *rng, - ShaderData *sd, float3 *throughput, PathState *state, PathRadiance *L, Ray *ray) +bool kernel_path_volume_bounce( + KernelGlobals *kg, + ShaderData *sd, + ccl_addr_space float3 *throughput, + ccl_addr_space PathState *state, + PathRadianceState *L_state, + ccl_addr_space Ray *ray) { /* sample phase function */ float phase_pdf; @@ -76,7 +77,7 @@ bool kernel_path_volume_bounce(KernelGlobals *kg, RNG *rng, float3 phase_omega_in; differential3 phase_domega_in; float phase_u, phase_v; - path_state_rng_2D(kg, rng, state, PRNG_PHASE_U, &phase_u, &phase_v); + path_state_rng_2D(kg, state, 
PRNG_BSDF_U, &phase_u, &phase_v); int label; label = shader_volume_phase_sample(kg, sd, phase_u, phase_v, &phase_eval, @@ -86,7 +87,7 @@ bool kernel_path_volume_bounce(KernelGlobals *kg, RNG *rng, return false; /* modify throughput */ - path_radiance_bsdf_bounce(L, throughput, &phase_eval, phase_pdf, state->bounce, label); + path_radiance_bsdf_bounce(kg, L_state, throughput, &phase_eval, phase_pdf, state->bounce, label); /* set labels */ state->ray_pdf = phase_pdf; @@ -111,9 +112,17 @@ bool kernel_path_volume_bounce(KernelGlobals *kg, RNG *rng, return true; } -ccl_device void kernel_branched_path_volume_connect_light(KernelGlobals *kg, RNG *rng, - ShaderData *sd, ShaderData *emission_sd, float3 throughput, PathState *state, PathRadiance *L, - bool sample_all_lights, Ray *ray, const VolumeSegment *segment) +#ifndef __SPLIT_KERNEL__ +ccl_device void kernel_branched_path_volume_connect_light( + KernelGlobals *kg, + ShaderData *sd, + ShaderData *emission_sd, + float3 throughput, + ccl_addr_space PathState *state, + PathRadiance *L, + bool sample_all_lights, + Ray *ray, + const VolumeSegment *segment) { #ifdef __EMISSION__ if(!kernel_data.integrator.use_direct_light) @@ -123,9 +132,7 @@ ccl_device void kernel_branched_path_volume_connect_light(KernelGlobals *kg, RNG BsdfEval L_light; bool is_lamp; -# ifdef __OBJECT_MOTION__ light_ray.time = sd->time; -# endif if(sample_all_lights) { /* lamp sampling */ @@ -135,12 +142,12 @@ ccl_device void kernel_branched_path_volume_connect_light(KernelGlobals *kg, RNG int num_samples = light_select_num_samples(kg, i); float num_samples_inv = 1.0f/(num_samples*kernel_data.integrator.num_all_lights); - RNG lamp_rng = cmj_hash(*rng, i); + uint lamp_rng_hash = cmj_hash(state->rng_hash, i); for(int j = 0; j < num_samples; j++) { /* sample random position on given light */ float light_u, light_v; - path_branched_rng_2D(kg, &lamp_rng, state, j, num_samples, PRNG_LIGHT_U, &light_u, &light_v); + path_branched_rng_2D(kg, lamp_rng_hash, state, 
j, num_samples, PRNG_LIGHT_U, &light_u, &light_v); LightSample ls; lamp_light_sample(kg, i, light_u, light_v, ray->P, &ls); @@ -148,28 +155,26 @@ ccl_device void kernel_branched_path_volume_connect_light(KernelGlobals *kg, RNG float3 tp = throughput; /* sample position on volume segment */ - float rphase = path_branched_rng_1D_for_decision(kg, rng, state, j, num_samples, PRNG_PHASE); - float rscatter = path_branched_rng_1D_for_decision(kg, rng, state, j, num_samples, PRNG_SCATTER_DISTANCE); + float rphase = path_branched_rng_1D(kg, state->rng_hash, state, j, num_samples, PRNG_PHASE_CHANNEL); + float rscatter = path_branched_rng_1D(kg, state->rng_hash, state, j, num_samples, PRNG_SCATTER_DISTANCE); VolumeIntegrateResult result = kernel_volume_decoupled_scatter(kg, state, ray, sd, &tp, rphase, rscatter, segment, (ls.t != FLT_MAX)? &ls.P: NULL, false); - (void)result; - kernel_assert(result == VOLUME_PATH_SCATTERED); - /* todo: split up light_sample so we don't have to call it again with new position */ - if(lamp_light_sample(kg, i, light_u, light_v, sd->P, &ls)) { + if(result == VOLUME_PATH_SCATTERED && + lamp_light_sample(kg, i, light_u, light_v, sd->P, &ls)) { if(kernel_data.integrator.pdf_triangles != 0.0f) ls.pdf *= 2.0f; - float terminate = path_branched_rng_light_termination(kg, rng, state, j, num_samples); + float terminate = path_branched_rng_light_termination(kg, state->rng_hash, state, j, num_samples); if(direct_emission(kg, sd, emission_sd, &ls, state, &light_ray, &L_light, &is_lamp, terminate)) { /* trace shadow ray */ float3 shadow; - if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) { + if(!shadow_blocked(kg, sd, emission_sd, state, &light_ray, &shadow)) { /* accumulate */ - path_radiance_accum_light(L, tp*num_samples_inv, &L_light, shadow, num_samples_inv, state->bounce, is_lamp); + path_radiance_accum_light(L, state, tp*num_samples_inv, &L_light, shadow, num_samples_inv, is_lamp); } } } @@ -183,42 +188,39 @@ ccl_device void 
kernel_branched_path_volume_connect_light(KernelGlobals *kg, RNG for(int j = 0; j < num_samples; j++) { /* sample random position on random triangle */ - float light_t = path_branched_rng_1D_for_decision(kg, rng, state, j, num_samples, PRNG_LIGHT); float light_u, light_v; - path_branched_rng_2D(kg, rng, state, j, num_samples, PRNG_LIGHT_U, &light_u, &light_v); + path_branched_rng_2D(kg, state->rng_hash, state, j, num_samples, PRNG_LIGHT_U, &light_u, &light_v); /* only sample triangle lights */ if(kernel_data.integrator.num_all_lights) - light_t = 0.5f*light_t; + light_u = 0.5f*light_u; LightSample ls; - light_sample(kg, light_t, light_u, light_v, sd->time, ray->P, state->bounce, &ls); + light_sample(kg, light_u, light_v, sd->time, ray->P, state->bounce, &ls); float3 tp = throughput; /* sample position on volume segment */ - float rphase = path_branched_rng_1D_for_decision(kg, rng, state, j, num_samples, PRNG_PHASE); - float rscatter = path_branched_rng_1D_for_decision(kg, rng, state, j, num_samples, PRNG_SCATTER_DISTANCE); + float rphase = path_branched_rng_1D(kg, state->rng_hash, state, j, num_samples, PRNG_PHASE_CHANNEL); + float rscatter = path_branched_rng_1D(kg, state->rng_hash, state, j, num_samples, PRNG_SCATTER_DISTANCE); VolumeIntegrateResult result = kernel_volume_decoupled_scatter(kg, state, ray, sd, &tp, rphase, rscatter, segment, (ls.t != FLT_MAX)? 
&ls.P: NULL, false); - (void)result; - kernel_assert(result == VOLUME_PATH_SCATTERED); - /* todo: split up light_sample so we don't have to call it again with new position */ - if(light_sample(kg, light_t, light_u, light_v, sd->time, sd->P, state->bounce, &ls)) { + if(result == VOLUME_PATH_SCATTERED && + light_sample(kg, light_u, light_v, sd->time, sd->P, state->bounce, &ls)) { if(kernel_data.integrator.num_all_lights) ls.pdf *= 2.0f; - float terminate = path_branched_rng_light_termination(kg, rng, state, j, num_samples); + float terminate = path_branched_rng_light_termination(kg, state->rng_hash, state, j, num_samples); if(direct_emission(kg, sd, emission_sd, &ls, state, &light_ray, &L_light, &is_lamp, terminate)) { /* trace shadow ray */ float3 shadow; - if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) { + if(!shadow_blocked(kg, sd, emission_sd, state, &light_ray, &shadow)) { /* accumulate */ - path_radiance_accum_light(L, tp*num_samples_inv, &L_light, shadow, num_samples_inv, state->bounce, is_lamp); + path_radiance_accum_light(L, state, tp*num_samples_inv, &L_light, shadow, num_samples_inv, is_lamp); } } } @@ -227,44 +229,42 @@ ccl_device void kernel_branched_path_volume_connect_light(KernelGlobals *kg, RNG } else { /* sample random position on random light */ - float light_t = path_state_rng_1D(kg, rng, state, PRNG_LIGHT); float light_u, light_v; - path_state_rng_2D(kg, rng, state, PRNG_LIGHT_U, &light_u, &light_v); + path_state_rng_2D(kg, state, PRNG_LIGHT_U, &light_u, &light_v); LightSample ls; - light_sample(kg, light_t, light_u, light_v, sd->time, ray->P, state->bounce, &ls); + light_sample(kg, light_u, light_v, sd->time, ray->P, state->bounce, &ls); float3 tp = throughput; /* sample position on volume segment */ - float rphase = path_state_rng_1D_for_decision(kg, rng, state, PRNG_PHASE); - float rscatter = path_state_rng_1D_for_decision(kg, rng, state, PRNG_SCATTER_DISTANCE); + float rphase = path_state_rng_1D(kg, state, 
PRNG_PHASE_CHANNEL); + float rscatter = path_state_rng_1D(kg, state, PRNG_SCATTER_DISTANCE); VolumeIntegrateResult result = kernel_volume_decoupled_scatter(kg, state, ray, sd, &tp, rphase, rscatter, segment, (ls.t != FLT_MAX)? &ls.P: NULL, false); - (void)result; - kernel_assert(result == VOLUME_PATH_SCATTERED); - /* todo: split up light_sample so we don't have to call it again with new position */ - if(light_sample(kg, light_t, light_u, light_v, sd->time, sd->P, state->bounce, &ls)) { + if(result == VOLUME_PATH_SCATTERED && + light_sample(kg, light_u, light_v, sd->time, sd->P, state->bounce, &ls)) { /* sample random light */ - float terminate = path_state_rng_light_termination(kg, rng, state); + float terminate = path_state_rng_light_termination(kg, state); if(direct_emission(kg, sd, emission_sd, &ls, state, &light_ray, &L_light, &is_lamp, terminate)) { /* trace shadow ray */ float3 shadow; - if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) { + if(!shadow_blocked(kg, sd, emission_sd, state, &light_ray, &shadow)) { /* accumulate */ - path_radiance_accum_light(L, tp, &L_light, shadow, 1.0f, state->bounce, is_lamp); + path_radiance_accum_light(L, state, tp, &L_light, shadow, 1.0f, is_lamp); } } } } -#endif +#endif /* __EMISSION__ */ } +#endif /* __SPLIT_KERNEL__ */ -#endif +#endif /* __VOLUME_SCATTER__ */ CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernel_projection.h b/intern/cycles/kernel/kernel_projection.h index 9a2b0884a7e..cbb2442d1dc 100644 --- a/intern/cycles/kernel/kernel_projection.h +++ b/intern/cycles/kernel/kernel_projection.h @@ -57,6 +57,9 @@ ccl_device float3 spherical_to_direction(float theta, float phi) ccl_device float2 direction_to_equirectangular_range(float3 dir, float4 range) { + if(is_zero(dir)) + return make_float2(0.0f, 0.0f); + float u = (atan2f(dir.y, dir.x) - range.y) / range.x; float v = (acosf(dir.z / len(dir)) - range.w) / range.z; diff --git a/intern/cycles/kernel/kernel_queues.h 
b/intern/cycles/kernel/kernel_queues.h index cf5614b8a86..e32d4bbbc1b 100644 --- a/intern/cycles/kernel/kernel_queues.h +++ b/intern/cycles/kernel/kernel_queues.h @@ -17,12 +17,15 @@ #ifndef __KERNEL_QUEUE_H__ #define __KERNEL_QUEUE_H__ +CCL_NAMESPACE_BEGIN + /* * Queue utility functions for split kernel */ - +#ifdef __KERNEL_OPENCL__ #pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable #pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable +#endif /* * Enqueue ray index into the queue @@ -35,7 +38,8 @@ ccl_device void enqueue_ray_index( ccl_global int *queue_index) /* Array of size num_queues; Used for atomic increment. */ { /* This thread's queue index. */ - int my_queue_index = atomic_inc(&queue_index[queue_number]) + (queue_number * queue_size); + int my_queue_index = atomic_fetch_and_inc_uint32((ccl_global uint*)&queue_index[queue_number]) + + (queue_number * queue_size); queues[my_queue_index] = ray_index; } @@ -47,6 +51,7 @@ ccl_device void enqueue_ray_index( * is no more ray to allocate to other threads. */ ccl_device int get_ray_index( + KernelGlobals *kg, int thread_index, /* Global thread index. */ int queue_number, /* Queue to operate on. */ ccl_global int *queues, /* Buffer of all queues. */ @@ -68,24 +73,25 @@ ccl_device void enqueue_ray_index_local( int queue_number, /* Queue in which to enqueue ray index. */ char enqueue_flag, /* True for threads whose ray index has to be enqueued. */ int queuesize, /* queue size. */ - ccl_local unsigned int *local_queue_atomics, /* To to local queue atomics. */ + ccl_local_param unsigned int *local_queue_atomics, /* To to local queue atomics. */ ccl_global int *Queue_data, /* Queues. */ ccl_global int *Queue_index) /* To do global queue atomics. 
*/ { - int lidx = get_local_id(1) * get_local_size(0) + get_local_id(0); + int lidx = ccl_local_id(1) * ccl_local_size(0) + ccl_local_id(0); /* Get local queue id .*/ unsigned int lqidx; if(enqueue_flag) { - lqidx = atomic_inc(local_queue_atomics); + lqidx = atomic_fetch_and_inc_uint32(local_queue_atomics); } - barrier(CLK_LOCAL_MEM_FENCE); + ccl_barrier(CCL_LOCAL_MEM_FENCE); /* Get global queue offset. */ if(lidx == 0) { - *local_queue_atomics = atomic_add(&Queue_index[queue_number], *local_queue_atomics); + *local_queue_atomics = atomic_fetch_and_add_uint32((ccl_global uint*)&Queue_index[queue_number], + *local_queue_atomics); } - barrier(CLK_LOCAL_MEM_FENCE); + ccl_barrier(CCL_LOCAL_MEM_FENCE); /* Get global queue index and enqueue ray. */ if(enqueue_flag) { @@ -96,19 +102,19 @@ ccl_device void enqueue_ray_index_local( ccl_device unsigned int get_local_queue_index( int queue_number, /* Queue in which to enqueue the ray; -1 if no queue */ - ccl_local unsigned int *local_queue_atomics) + ccl_local_param unsigned int *local_queue_atomics) { - int my_lqidx = atomic_inc(&local_queue_atomics[queue_number]); + int my_lqidx = atomic_fetch_and_inc_uint32(&local_queue_atomics[queue_number]); return my_lqidx; } ccl_device unsigned int get_global_per_queue_offset( int queue_number, - ccl_local unsigned int *local_queue_atomics, + ccl_local_param unsigned int *local_queue_atomics, ccl_global int* global_queue_atomics) { - unsigned int queue_offset = atomic_add(&global_queue_atomics[queue_number], - local_queue_atomics[queue_number]); + unsigned int queue_offset = atomic_fetch_and_add_uint32((ccl_global uint*)&global_queue_atomics[queue_number], + local_queue_atomics[queue_number]); return queue_offset; } @@ -116,10 +122,27 @@ ccl_device unsigned int get_global_queue_index( int queue_number, int queuesize, unsigned int lqidx, - ccl_local unsigned int * global_per_queue_offset) + ccl_local_param unsigned int * global_per_queue_offset) { int my_gqidx = queuesize * queue_number 
+ lqidx + global_per_queue_offset[queue_number]; return my_gqidx; } +ccl_device int dequeue_ray_index( + int queue_number, + ccl_global int *queues, + int queue_size, + ccl_global int *queue_index) +{ + int index = atomic_fetch_and_dec_uint32((ccl_global uint*)&queue_index[queue_number])-1; + + if(index < 0) { + return QUEUE_EMPTY_SLOT; + } + + return queues[index + queue_number * queue_size]; +} + +CCL_NAMESPACE_END + #endif // __KERNEL_QUEUE_H__ diff --git a/intern/cycles/kernel/kernel_random.h b/intern/cycles/kernel/kernel_random.h index 2b767da5041..e7a6134b8eb 100644 --- a/intern/cycles/kernel/kernel_random.h +++ b/intern/cycles/kernel/kernel_random.h @@ -14,222 +14,130 @@ * limitations under the License. */ -#include "kernel_jitter.h" +#include "kernel/kernel_jitter.h" +#include "util/util_hash.h" CCL_NAMESPACE_BEGIN -#ifdef __SOBOL__ - -/* skip initial numbers that are not as well distributed, especially the - * first sequence is just 0 everywhere, which can be problematic for e.g. - * path termination */ -#define SOBOL_SKIP 64 - -/* High Dimensional Sobol */ +/* Pseudo random numbers, uncomment this for debugging correlations. Only run + * this single threaded on a CPU for repeatable resutls. 
*/ +//#define __DEBUG_CORRELATION__ -/* van der corput radical inverse */ -ccl_device uint van_der_corput(uint bits) -{ - bits = (bits << 16) | (bits >> 16); - bits = ((bits & 0x00ff00ff) << 8) | ((bits & 0xff00ff00) >> 8); - bits = ((bits & 0x0f0f0f0f) << 4) | ((bits & 0xf0f0f0f0) >> 4); - bits = ((bits & 0x33333333) << 2) | ((bits & 0xcccccccc) >> 2); - bits = ((bits & 0x55555555) << 1) | ((bits & 0xaaaaaaaa) >> 1); - return bits; -} -/* sobol radical inverse */ -ccl_device uint sobol(uint i) -{ - uint r = 0; - - for(uint v = 1U << 31; i; i >>= 1, v ^= v >> 1) - if(i & 1) - r ^= v; - - return r; -} - -/* inverse of sobol radical inverse */ -ccl_device uint sobol_inverse(uint i) -{ - const uint msb = 1U << 31; - uint r = 0; - - for(uint v = 1; i; i <<= 1, v ^= v << 1) - if(i & msb) - r ^= v; +/* High Dimensional Sobol. + * + * Multidimensional sobol with generator matrices. Dimension 0 and 1 are equal + * to classic Van der Corput and Sobol sequences. */ - return r; -} +#ifdef __SOBOL__ -/* multidimensional sobol with generator matrices - * dimension 0 and 1 are equal to van_der_corput() and sobol() respectively */ ccl_device uint sobol_dimension(KernelGlobals *kg, int index, int dimension) { uint result = 0; uint i = index; - - for(uint j = 0; i; i >>= 1, j++) - if(i & 1) + for(uint j = 0; i; i >>= 1, j++) { + if(i & 1) { result ^= kernel_tex_fetch(__sobol_directions, 32*dimension + j); - + } + } return result; } -/* lookup index and x/y coordinate, assumes m is a power of two */ -ccl_device uint sobol_lookup(const uint m, const uint frame, const uint ex, const uint ey, uint *x, uint *y) -{ - /* shift is constant per frame */ - const uint shift = frame << (m << 1); - const uint sobol_shift = sobol(shift); - /* van der Corput is its own inverse */ - const uint lower = van_der_corput(ex << (32 - m)); - /* need to compensate for ey difference and shift */ - const uint sobol_lower = sobol(lower); - const uint mask = ~-(1 << m) << (32 - m); /* only m upper bits */ - 
const uint delta = ((ey << (32 - m)) ^ sobol_lower ^ sobol_shift) & mask; - /* only use m upper bits for the index (m is a power of two) */ - const uint sobol_result = delta | (delta >> m); - const uint upper = sobol_inverse(sobol_result); - const uint index = shift | upper | lower; - *x = van_der_corput(index); - *y = sobol_shift ^ sobol_result ^ sobol_lower; - return index; -} +#endif /* __SOBOL__ */ -ccl_device_forceinline float path_rng_1D(KernelGlobals *kg, ccl_addr_space RNG *rng, int sample, int num_samples, int dimension) + +ccl_device_forceinline float path_rng_1D(KernelGlobals *kg, + uint rng_hash, + int sample, int num_samples, + int dimension) { +#ifdef __DEBUG_CORRELATION__ + return (float)drand48(); +#endif + #ifdef __CMJ__ - if(kernel_data.integrator.sampling_pattern == SAMPLING_PATTERN_CMJ) { - /* correlated multi-jittered */ - int p = *rng + dimension; +# ifdef __SOBOL__ + if(kernel_data.integrator.sampling_pattern == SAMPLING_PATTERN_CMJ) +# endif + { + /* Correlated multi-jitter. */ + int p = rng_hash + dimension; return cmj_sample_1D(sample, num_samples, p); } #endif -#ifdef __SOBOL_FULL_SCREEN__ - uint result = sobol_dimension(kg, *rng, dimension); - float r = (float)result * (1.0f/(float)0xFFFFFFFF); - return r; -#else - /* compute sobol sequence value using direction vectors */ - uint result = sobol_dimension(kg, sample + SOBOL_SKIP, dimension); +#ifdef __SOBOL__ + /* Sobol sequence value using direction vectors. 
*/ + uint result = sobol_dimension(kg, sample, dimension); float r = (float)result * (1.0f/(float)0xFFFFFFFF); /* Cranly-Patterson rotation using rng seed */ float shift; - /* using the same *rng value to offset seems to give correlation issues, - * we could hash it with the dimension but this has a performance impact, - * we need to find a solution for this */ - if(dimension & 1) - shift = (*rng >> 16) * (1.0f/(float)0xFFFF); - else - shift = (*rng & 0xFFFF) * (1.0f/(float)0xFFFF); + /* Hash rng with dimension to solve correlation issues. + * See T38710, T50116. + */ + uint tmp_rng = cmj_hash_simple(dimension, rng_hash); + shift = tmp_rng * (1.0f/(float)0xFFFFFFFF); return r + shift - floorf(r + shift); #endif } -ccl_device_forceinline void path_rng_2D(KernelGlobals *kg, ccl_addr_space RNG *rng, int sample, int num_samples, int dimension, float *fx, float *fy) +ccl_device_forceinline void path_rng_2D(KernelGlobals *kg, + uint rng_hash, + int sample, int num_samples, + int dimension, + float *fx, float *fy) { +#ifdef __DEBUG_CORRELATION__ + *fx = (float)drand48(); + *fy = (float)drand48(); + return; +#endif + #ifdef __CMJ__ - if(kernel_data.integrator.sampling_pattern == SAMPLING_PATTERN_CMJ) { - /* correlated multi-jittered */ - int p = *rng + dimension; +# ifdef __SOBOL__ + if(kernel_data.integrator.sampling_pattern == SAMPLING_PATTERN_CMJ) +# endif + { + /* Correlated multi-jitter. 
*/ + int p = rng_hash + dimension; cmj_sample_2D(sample, num_samples, p, fx, fy); + return; } - else #endif - { - /* sobol */ - *fx = path_rng_1D(kg, rng, sample, num_samples, dimension); - *fy = path_rng_1D(kg, rng, sample, num_samples, dimension + 1); - } -} - -ccl_device_inline void path_rng_init(KernelGlobals *kg, ccl_global uint *rng_state, int sample, int num_samples, ccl_addr_space RNG *rng, int x, int y, float *fx, float *fy) -{ -#ifdef __SOBOL_FULL_SCREEN__ - uint px, py; - uint bits = 16; /* limits us to 65536x65536 and 65536 samples */ - uint size = 1 << bits; - uint frame = sample; - - *rng = sobol_lookup(bits, frame, x, y, &px, &py); - - *rng ^= kernel_data.integrator.seed; - - if(sample == 0) { - *fx = 0.5f; - *fy = 0.5f; - } - else { - *fx = size * (float)px * (1.0f/(float)0xFFFFFFFF) - x; - *fy = size * (float)py * (1.0f/(float)0xFFFFFFFF) - y; - } -#else - *rng = *rng_state; - *rng ^= kernel_data.integrator.seed; - - if(sample == 0) { - *fx = 0.5f; - *fy = 0.5f; - } - else { - path_rng_2D(kg, rng, sample, num_samples, PRNG_FILTER_U, fx, fy); - } +#ifdef __SOBOL__ + /* Sobol. 
*/ + *fx = path_rng_1D(kg, rng_hash, sample, num_samples, dimension); + *fy = path_rng_1D(kg, rng_hash, sample, num_samples, dimension + 1); #endif } -ccl_device void path_rng_end(KernelGlobals *kg, ccl_global uint *rng_state, RNG rng) -{ - /* nothing to do */ -} - -#else - -/* Linear Congruential Generator */ - -ccl_device_forceinline float path_rng_1D(KernelGlobals *kg, RNG& rng, int sample, int num_samples, int dimension) -{ - /* implicit mod 2^32 */ - rng = (1103515245*(rng) + 12345); - return (float)rng * (1.0f/(float)0xFFFFFFFF); -} - -ccl_device_inline void path_rng_2D(KernelGlobals *kg, RNG& rng, int sample, int num_samples, int dimension, float *fx, float *fy) -{ - *fx = path_rng_1D(kg, rng, sample, num_samples, dimension); - *fy = path_rng_1D(kg, rng, sample, num_samples, dimension + 1); -} - -ccl_device void path_rng_init(KernelGlobals *kg, ccl_global uint *rng_state, int sample, int num_samples, RNG *rng, int x, int y, float *fx, float *fy) +ccl_device_inline void path_rng_init(KernelGlobals *kg, + int sample, int num_samples, + uint *rng_hash, + int x, int y, + float *fx, float *fy) { /* load state */ - *rng = *rng_state; + *rng_hash = hash_int_2d(x, y); + *rng_hash ^= kernel_data.integrator.seed; - *rng ^= kernel_data.integrator.seed; +#ifdef __DEBUG_CORRELATION__ + srand48(*rng_hash + sample); +#endif if(sample == 0) { *fx = 0.5f; *fy = 0.5f; } else { - path_rng_2D(kg, rng, sample, num_samples, PRNG_FILTER_U, fx, fy); + path_rng_2D(kg, *rng_hash, sample, num_samples, PRNG_FILTER_U, fx, fy); } } -ccl_device void path_rng_end(KernelGlobals *kg, ccl_global uint *rng_state, RNG rng) -{ - /* store state for next sample */ - *rng_state = rng; -} - -#endif - /* Linear Congruential Generator */ ccl_device uint lcg_step_uint(uint *rng) @@ -259,90 +167,110 @@ ccl_device uint lcg_init(uint seed) * dimension to avoid using the same sequence twice. 
* * For branches in the path we must be careful not to reuse the same number - * in a sequence and offset accordingly. */ - -ccl_device_inline float path_state_rng_1D(KernelGlobals *kg, ccl_addr_space RNG *rng, const ccl_addr_space PathState *state, int dimension) -{ - return path_rng_1D(kg, rng, state->sample, state->num_samples, state->rng_offset + dimension); -} - -ccl_device_inline float path_state_rng_1D_for_decision(KernelGlobals *kg, ccl_addr_space RNG *rng, const ccl_addr_space PathState *state, int dimension) -{ - /* the rng_offset is not increased for transparent bounces. if we do then - * fully transparent objects can become subtly visible by the different - * sampling patterns used where the transparent object is. - * - * however for some random numbers that will determine if we next bounce - * is transparent we do need to increase the offset to avoid always making - * the same decision */ - int rng_offset = state->rng_offset + state->transparent_bounce*PRNG_BOUNCE_NUM; - return path_rng_1D(kg, rng, state->sample, state->num_samples, rng_offset + dimension); -} + * in a sequence and offset accordingly. 
+ */ -ccl_device_inline void path_state_rng_2D(KernelGlobals *kg, ccl_addr_space RNG *rng, const ccl_addr_space PathState *state, int dimension, float *fx, float *fy) +ccl_device_inline float path_state_rng_1D(KernelGlobals *kg, + const ccl_addr_space PathState *state, + int dimension) { - path_rng_2D(kg, rng, state->sample, state->num_samples, state->rng_offset + dimension, fx, fy); + return path_rng_1D(kg, + state->rng_hash, + state->sample, state->num_samples, + state->rng_offset + dimension); } -ccl_device_inline float path_branched_rng_1D(KernelGlobals *kg, ccl_addr_space RNG *rng, const PathState *state, int branch, int num_branches, int dimension) +ccl_device_inline void path_state_rng_2D(KernelGlobals *kg, + const ccl_addr_space PathState *state, + int dimension, + float *fx, float *fy) { - return path_rng_1D(kg, rng, state->sample*num_branches + branch, state->num_samples*num_branches, state->rng_offset + dimension); + path_rng_2D(kg, + state->rng_hash, + state->sample, state->num_samples, + state->rng_offset + dimension, + fx, fy); } -ccl_device_inline float path_branched_rng_1D_for_decision(KernelGlobals *kg, ccl_addr_space RNG *rng, const PathState *state, int branch, int num_branches, int dimension) +ccl_device_inline float path_branched_rng_1D( + KernelGlobals *kg, + uint rng_hash, + const ccl_addr_space PathState *state, + int branch, + int num_branches, + int dimension) { - int rng_offset = state->rng_offset + state->transparent_bounce*PRNG_BOUNCE_NUM; - return path_rng_1D(kg, rng, state->sample*num_branches + branch, state->num_samples*num_branches, rng_offset + dimension); + return path_rng_1D(kg, + rng_hash, + state->sample * num_branches + branch, + state->num_samples * num_branches, + state->rng_offset + dimension); } -ccl_device_inline void path_branched_rng_2D(KernelGlobals *kg, ccl_addr_space RNG *rng, const PathState *state, int branch, int num_branches, int dimension, float *fx, float *fy) +ccl_device_inline void path_branched_rng_2D( + 
KernelGlobals *kg, + uint rng_hash, + const ccl_addr_space PathState *state, + int branch, + int num_branches, + int dimension, + float *fx, float *fy) { - path_rng_2D(kg, rng, state->sample*num_branches + branch, state->num_samples*num_branches, state->rng_offset + dimension, fx, fy); + path_rng_2D(kg, + rng_hash, + state->sample * num_branches + branch, + state->num_samples * num_branches, + state->rng_offset + dimension, + fx, fy); } -/* Utitility functions to get light termination value, since it might not be needed in many cases. */ -ccl_device_inline float path_state_rng_light_termination(KernelGlobals *kg, ccl_addr_space RNG *rng, const ccl_addr_space PathState *state) +/* Utitility functions to get light termination value, + * since it might not be needed in many cases. + */ +ccl_device_inline float path_state_rng_light_termination( + KernelGlobals *kg, + const ccl_addr_space PathState *state) { if(kernel_data.integrator.light_inv_rr_threshold > 0.0f) { - return path_state_rng_1D_for_decision(kg, rng, state, PRNG_LIGHT_TERMINATE); + return path_state_rng_1D(kg, state, PRNG_LIGHT_TERMINATE); } return 0.0f; } -ccl_device_inline float path_branched_rng_light_termination(KernelGlobals *kg, ccl_addr_space RNG *rng, const PathState *state, int branch, int num_branches) +ccl_device_inline float path_branched_rng_light_termination( + KernelGlobals *kg, + uint rng_hash, + const ccl_addr_space PathState *state, + int branch, + int num_branches) { if(kernel_data.integrator.light_inv_rr_threshold > 0.0f) { - return path_branched_rng_1D_for_decision(kg, rng, state, branch, num_branches, PRNG_LIGHT_TERMINATE); + return path_branched_rng_1D(kg, + rng_hash, + state, + branch, + num_branches, + PRNG_LIGHT_TERMINATE); } return 0.0f; } -ccl_device_inline void path_state_branch(PathState *state, int branch, int num_branches) -{ - /* path is splitting into a branch, adjust so that each branch - * still gets a unique sample from the same sequence */ - state->rng_offset += 
PRNG_BOUNCE_NUM; - state->sample = state->sample*num_branches + branch; - state->num_samples = state->num_samples*num_branches; -} - -ccl_device_inline uint lcg_state_init(RNG *rng, const PathState *state, uint scramble) +ccl_device_inline uint lcg_state_init(PathState *state, + uint scramble) { - return lcg_init(*rng + state->rng_offset + state->sample*scramble); + return lcg_init(state->rng_hash + state->rng_offset + state->sample*scramble); } -/* TODO(sergey): For until we can use generic address space from OpenCL 2.0. */ - -ccl_device_inline uint lcg_state_init_addrspace(ccl_addr_space RNG *rng, - const ccl_addr_space PathState *state, +ccl_device_inline uint lcg_state_init_addrspace(ccl_addr_space PathState *state, uint scramble) { - return lcg_init(*rng + state->rng_offset + state->sample*scramble); + return lcg_init(state->rng_hash + state->rng_offset + state->sample*scramble); } + ccl_device float lcg_step_float_addrspace(ccl_addr_space uint *rng) { - /* implicit mod 2^32 */ + /* Implicit mod 2^32 */ *rng = (1103515245*(*rng) + 12345); return (float)*rng * (1.0f/(float)0xFFFFFFFF); } diff --git a/intern/cycles/kernel/kernel_shader.h b/intern/cycles/kernel/kernel_shader.h index 9d5ea53d5d8..d46da189661 100644 --- a/intern/cycles/kernel/kernel_shader.h +++ b/intern/cycles/kernel/kernel_shader.h @@ -24,12 +24,12 @@ * */ -#include "closure/alloc.h" -#include "closure/bsdf_util.h" -#include "closure/bsdf.h" -#include "closure/emissive.h" +#include "kernel/closure/alloc.h" +#include "kernel/closure/bsdf_util.h" +#include "kernel/closure/bsdf.h" +#include "kernel/closure/emissive.h" -#include "svm/svm.h" +#include "kernel/svm/svm.h" CCL_NAMESPACE_BEGIN @@ -38,13 +38,13 @@ CCL_NAMESPACE_BEGIN #ifdef __OBJECT_MOTION__ ccl_device void shader_setup_object_transforms(KernelGlobals *kg, ShaderData *sd, float time) { - if(ccl_fetch(sd, flag) & SD_OBJECT_MOTION) { - ccl_fetch(sd, ob_tfm) = object_fetch_transform_motion(kg, ccl_fetch(sd, object), time); - ccl_fetch(sd, 
ob_itfm) = transform_quick_inverse(ccl_fetch(sd, ob_tfm)); + if(sd->object_flag & SD_OBJECT_MOTION) { + sd->ob_tfm = object_fetch_transform_motion(kg, sd->object, time); + sd->ob_itfm = transform_quick_inverse(sd->ob_tfm); } else { - ccl_fetch(sd, ob_tfm) = object_fetch_transform(kg, ccl_fetch(sd, object), OBJECT_TRANSFORM); - ccl_fetch(sd, ob_itfm) = object_fetch_transform(kg, ccl_fetch(sd, object), OBJECT_INVERSE_TRANSFORM); + sd->ob_tfm = object_fetch_transform(kg, sd->object, OBJECT_TRANSFORM); + sd->ob_itfm = object_fetch_transform(kg, sd->object, OBJECT_INVERSE_TRANSFORM); } } #endif @@ -55,103 +55,104 @@ ccl_device_noinline void shader_setup_from_ray(KernelGlobals *kg, const Ray *ray) { #ifdef __INSTANCING__ - ccl_fetch(sd, object) = (isect->object == PRIM_NONE)? kernel_tex_fetch(__prim_object, isect->prim): isect->object; + sd->object = (isect->object == PRIM_NONE)? kernel_tex_fetch(__prim_object, isect->prim): isect->object; #endif - ccl_fetch(sd, type) = isect->type; - ccl_fetch(sd, flag) = kernel_tex_fetch(__object_flag, ccl_fetch(sd, object)); + sd->type = isect->type; + sd->flag = 0; + sd->object_flag = kernel_tex_fetch(__object_flag, + sd->object); /* matrices and time */ #ifdef __OBJECT_MOTION__ shader_setup_object_transforms(kg, sd, ray->time); - ccl_fetch(sd, time) = ray->time; #endif + sd->time = ray->time; - if(ccl_fetch(sd, type) & PRIMITIVE_VOLUME) { - ccl_fetch(sd, prim) = isect->prim; + if (sd->type & PRIMITIVE_VOLUME) { + sd->prim = isect->prim; } else { - ccl_fetch(sd, prim) = kernel_tex_fetch(__prim_index, isect->prim); + sd->prim = kernel_tex_fetch(__prim_index, isect->prim); } - - ccl_fetch(sd, ray_length) = isect->t; + sd->ray_length = isect->t; #ifdef __UV__ - ccl_fetch(sd, u) = isect->u; - ccl_fetch(sd, v) = isect->v; + sd->u = isect->u; + sd->v = isect->v; #endif #ifdef __HAIR__ - if(ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE) { + if(sd->type & PRIMITIVE_ALL_CURVE) { /* curve */ - float4 curvedata = kernel_tex_fetch(__curves, 
ccl_fetch(sd, prim)); + float4 curvedata = kernel_tex_fetch(__curves, sd->prim); - ccl_fetch(sd, shader) = __float_as_int(curvedata.z); - ccl_fetch(sd, P) = bvh_curve_refine(kg, sd, isect, ray); + sd->shader = __float_as_int(curvedata.z); + sd->P = curve_refine(kg, sd, isect, ray); } else #endif - if(ccl_fetch(sd, type) & PRIMITIVE_TRIANGLE) { + if(sd->type & PRIMITIVE_TRIANGLE) { /* static triangle */ float3 Ng = triangle_normal(kg, sd); - ccl_fetch(sd, shader) = kernel_tex_fetch(__tri_shader, ccl_fetch(sd, prim)); + sd->shader = kernel_tex_fetch(__tri_shader, sd->prim); /* vectors */ - ccl_fetch(sd, P) = triangle_refine(kg, sd, isect, ray); - ccl_fetch(sd, Ng) = Ng; - ccl_fetch(sd, N) = Ng; + sd->P = triangle_refine(kg, sd, isect, ray); + sd->Ng = Ng; + sd->N = Ng; /* smooth normal */ - if(ccl_fetch(sd, shader) & SHADER_SMOOTH_NORMAL) - ccl_fetch(sd, N) = triangle_smooth_normal(kg, ccl_fetch(sd, prim), ccl_fetch(sd, u), ccl_fetch(sd, v)); + if(sd->shader & SHADER_SMOOTH_NORMAL) + sd->N = triangle_smooth_normal(kg, Ng, sd->prim, sd->u, sd->v); #ifdef __DPDU__ /* dPdu/dPdv */ - triangle_dPdudv(kg, ccl_fetch(sd, prim), &ccl_fetch(sd, dPdu), &ccl_fetch(sd, dPdv)); + triangle_dPdudv(kg, sd->prim, &sd->dPdu, &sd->dPdv); #endif } - else if(ccl_fetch(sd, type) & PRIMITIVE_VOLUME) { - ccl_fetch(sd, shader) = kernel_tex_fetch(__vol_shader, ccl_fetch(sd, prim)); + else if(sd->type & PRIMITIVE_VOLUME) { + sd->shader = kernel_tex_fetch(__vol_shader, sd->prim); } else { /* motion triangle */ motion_triangle_shader_setup(kg, sd, isect, ray, false); } - ccl_fetch(sd, I) = -ray->D; + sd->I = -ray->D; - ccl_fetch(sd, flag) |= kernel_tex_fetch(__shader_flag, (ccl_fetch(sd, shader) & SHADER_MASK)*SHADER_SIZE); + sd->flag |= kernel_tex_fetch(__shader_flag, (sd->shader & SHADER_MASK)*SHADER_SIZE); #ifdef __INSTANCING__ if(isect->object != OBJECT_NONE) { /* instance transform */ - object_normal_transform_auto(kg, sd, &ccl_fetch(sd, N)); - object_normal_transform_auto(kg, sd, 
&ccl_fetch(sd, Ng)); + object_normal_transform_auto(kg, sd, &sd->N); + object_normal_transform_auto(kg, sd, &sd->Ng); # ifdef __DPDU__ - object_dir_transform_auto(kg, sd, &ccl_fetch(sd, dPdu)); - object_dir_transform_auto(kg, sd, &ccl_fetch(sd, dPdv)); + object_dir_transform_auto(kg, sd, &sd->dPdu); + object_dir_transform_auto(kg, sd, &sd->dPdv); # endif } #endif /* backfacing test */ - bool backfacing = (dot(ccl_fetch(sd, Ng), ccl_fetch(sd, I)) < 0.0f); + bool backfacing = (dot(sd->Ng, sd->I) < 0.0f); if(backfacing) { - ccl_fetch(sd, flag) |= SD_BACKFACING; - ccl_fetch(sd, Ng) = -ccl_fetch(sd, Ng); - ccl_fetch(sd, N) = -ccl_fetch(sd, N); + sd->flag |= SD_BACKFACING; + sd->Ng = -sd->Ng; + sd->N = -sd->N; #ifdef __DPDU__ - ccl_fetch(sd, dPdu) = -ccl_fetch(sd, dPdu); - ccl_fetch(sd, dPdv) = -ccl_fetch(sd, dPdv); + sd->dPdu = -sd->dPdu; + sd->dPdv = -sd->dPdv; #endif } #ifdef __RAY_DIFFERENTIALS__ /* differentials */ - differential_transfer(&ccl_fetch(sd, dP), ray->dP, ray->D, ray->dD, ccl_fetch(sd, Ng), isect->t); - differential_incoming(&ccl_fetch(sd, dI), ray->dD); - differential_dudv(&ccl_fetch(sd, du), &ccl_fetch(sd, dv), ccl_fetch(sd, dPdu), ccl_fetch(sd, dPdv), ccl_fetch(sd, dP), ccl_fetch(sd, Ng)); + differential_transfer(&sd->dP, ray->dP, ray->D, ray->dD, sd->Ng, isect->t); + differential_incoming(&sd->dI, ray->dD); + differential_dudv(&sd->du, &sd->dv, sd->dPdu, sd->dPdv, sd->dP, sd->Ng); #endif } @@ -169,10 +170,11 @@ void shader_setup_from_subsurface( const Intersection *isect, const Ray *ray) { - bool backfacing = sd->flag & SD_BACKFACING; + const bool backfacing = sd->flag & SD_BACKFACING; /* object, matrices, time, ray_length stay the same */ - sd->flag = kernel_tex_fetch(__object_flag, sd->object); + sd->flag = 0; + sd->object_flag = kernel_tex_fetch(__object_flag, sd->object); sd->prim = kernel_tex_fetch(__prim_index, isect->prim); sd->type = isect->type; @@ -192,7 +194,7 @@ void shader_setup_from_subsurface( sd->N = Ng; if(sd->shader & 
SHADER_SMOOTH_NORMAL) - sd->N = triangle_smooth_normal(kg, sd->prim, sd->u, sd->v); + sd->N = triangle_smooth_normal(kg, Ng, sd->prim, sd->u, sd->v); # ifdef __DPDU__ /* dPdu/dPdv */ @@ -209,11 +211,11 @@ void shader_setup_from_subsurface( # ifdef __INSTANCING__ if(isect->object != OBJECT_NONE) { /* instance transform */ - object_normal_transform(kg, sd, &sd->N); - object_normal_transform(kg, sd, &sd->Ng); + object_normal_transform_auto(kg, sd, &sd->N); + object_normal_transform_auto(kg, sd, &sd->Ng); # ifdef __DPDU__ - object_dir_transform(kg, sd, &sd->dPdu); - object_dir_transform(kg, sd, &sd->dPdv); + object_dir_transform_auto(kg, sd, &sd->dPdu); + object_dir_transform_auto(kg, sd, &sd->dPdv); # endif } # endif @@ -255,104 +257,106 @@ ccl_device_inline void shader_setup_from_sample(KernelGlobals *kg, int lamp) { /* vectors */ - ccl_fetch(sd, P) = P; - ccl_fetch(sd, N) = Ng; - ccl_fetch(sd, Ng) = Ng; - ccl_fetch(sd, I) = I; - ccl_fetch(sd, shader) = shader; + sd->P = P; + sd->N = Ng; + sd->Ng = Ng; + sd->I = I; + sd->shader = shader; if(prim != PRIM_NONE) - ccl_fetch(sd, type) = PRIMITIVE_TRIANGLE; + sd->type = PRIMITIVE_TRIANGLE; else if(lamp != LAMP_NONE) - ccl_fetch(sd, type) = PRIMITIVE_LAMP; + sd->type = PRIMITIVE_LAMP; else - ccl_fetch(sd, type) = PRIMITIVE_NONE; + sd->type = PRIMITIVE_NONE; /* primitive */ #ifdef __INSTANCING__ - ccl_fetch(sd, object) = object; + sd->object = object; #endif /* currently no access to bvh prim index for strand sd->prim*/ - ccl_fetch(sd, prim) = prim; + sd->prim = prim; #ifdef __UV__ - ccl_fetch(sd, u) = u; - ccl_fetch(sd, v) = v; + sd->u = u; + sd->v = v; #endif - ccl_fetch(sd, ray_length) = t; + sd->time = time; + sd->ray_length = t; - ccl_fetch(sd, flag) = kernel_tex_fetch(__shader_flag, (ccl_fetch(sd, shader) & SHADER_MASK)*SHADER_SIZE); - if(ccl_fetch(sd, object) != OBJECT_NONE) { - ccl_fetch(sd, flag) |= kernel_tex_fetch(__object_flag, ccl_fetch(sd, object)); + sd->flag = kernel_tex_fetch(__shader_flag, (sd->shader & 
SHADER_MASK)*SHADER_SIZE); + sd->object_flag = 0; + if(sd->object != OBJECT_NONE) { + sd->object_flag |= kernel_tex_fetch(__object_flag, + sd->object); #ifdef __OBJECT_MOTION__ shader_setup_object_transforms(kg, sd, time); - ccl_fetch(sd, time) = time; } else if(lamp != LAMP_NONE) { - ccl_fetch(sd, ob_tfm) = lamp_fetch_transform(kg, lamp, false); - ccl_fetch(sd, ob_itfm) = lamp_fetch_transform(kg, lamp, true); + sd->ob_tfm = lamp_fetch_transform(kg, lamp, false); + sd->ob_itfm = lamp_fetch_transform(kg, lamp, true); #endif } /* transform into world space */ if(object_space) { - object_position_transform_auto(kg, sd, &ccl_fetch(sd, P)); - object_normal_transform_auto(kg, sd, &ccl_fetch(sd, Ng)); - ccl_fetch(sd, N) = ccl_fetch(sd, Ng); - object_dir_transform_auto(kg, sd, &ccl_fetch(sd, I)); + object_position_transform_auto(kg, sd, &sd->P); + object_normal_transform_auto(kg, sd, &sd->Ng); + sd->N = sd->Ng; + object_dir_transform_auto(kg, sd, &sd->I); } - if(ccl_fetch(sd, type) & PRIMITIVE_TRIANGLE) { + if(sd->type & PRIMITIVE_TRIANGLE) { /* smooth normal */ - if(ccl_fetch(sd, shader) & SHADER_SMOOTH_NORMAL) { - ccl_fetch(sd, N) = triangle_smooth_normal(kg, ccl_fetch(sd, prim), ccl_fetch(sd, u), ccl_fetch(sd, v)); + if(sd->shader & SHADER_SMOOTH_NORMAL) { + sd->N = triangle_smooth_normal(kg, Ng, sd->prim, sd->u, sd->v); #ifdef __INSTANCING__ - if(!(ccl_fetch(sd, flag) & SD_TRANSFORM_APPLIED)) { - object_normal_transform_auto(kg, sd, &ccl_fetch(sd, N)); + if(!(sd->object_flag & SD_OBJECT_TRANSFORM_APPLIED)) { + object_normal_transform_auto(kg, sd, &sd->N); } #endif } /* dPdu/dPdv */ #ifdef __DPDU__ - triangle_dPdudv(kg, ccl_fetch(sd, prim), &ccl_fetch(sd, dPdu), &ccl_fetch(sd, dPdv)); + triangle_dPdudv(kg, sd->prim, &sd->dPdu, &sd->dPdv); # ifdef __INSTANCING__ - if(!(ccl_fetch(sd, flag) & SD_TRANSFORM_APPLIED)) { - object_dir_transform_auto(kg, sd, &ccl_fetch(sd, dPdu)); - object_dir_transform_auto(kg, sd, &ccl_fetch(sd, dPdv)); + if(!(sd->object_flag & 
SD_OBJECT_TRANSFORM_APPLIED)) { + object_dir_transform_auto(kg, sd, &sd->dPdu); + object_dir_transform_auto(kg, sd, &sd->dPdv); } # endif #endif } else { #ifdef __DPDU__ - ccl_fetch(sd, dPdu) = make_float3(0.0f, 0.0f, 0.0f); - ccl_fetch(sd, dPdv) = make_float3(0.0f, 0.0f, 0.0f); + sd->dPdu = make_float3(0.0f, 0.0f, 0.0f); + sd->dPdv = make_float3(0.0f, 0.0f, 0.0f); #endif } /* backfacing test */ - if(ccl_fetch(sd, prim) != PRIM_NONE) { - bool backfacing = (dot(ccl_fetch(sd, Ng), ccl_fetch(sd, I)) < 0.0f); + if(sd->prim != PRIM_NONE) { + bool backfacing = (dot(sd->Ng, sd->I) < 0.0f); if(backfacing) { - ccl_fetch(sd, flag) |= SD_BACKFACING; - ccl_fetch(sd, Ng) = -ccl_fetch(sd, Ng); - ccl_fetch(sd, N) = -ccl_fetch(sd, N); + sd->flag |= SD_BACKFACING; + sd->Ng = -sd->Ng; + sd->N = -sd->N; #ifdef __DPDU__ - ccl_fetch(sd, dPdu) = -ccl_fetch(sd, dPdu); - ccl_fetch(sd, dPdv) = -ccl_fetch(sd, dPdv); + sd->dPdu = -sd->dPdu; + sd->dPdv = -sd->dPdv; #endif } } #ifdef __RAY_DIFFERENTIALS__ /* no ray differentials here yet */ - ccl_fetch(sd, dP) = differential3_zero(); - ccl_fetch(sd, dI) = differential3_zero(); - ccl_fetch(sd, du) = differential_zero(); - ccl_fetch(sd, dv) = differential_zero(); + sd->dP = differential3_zero(); + sd->dI = differential3_zero(); + sd->du = differential_zero(); + sd->dv = differential_zero(); #endif } @@ -373,7 +377,7 @@ ccl_device void shader_setup_from_displace(KernelGlobals *kg, ShaderData *sd, P, Ng, I, shader, object, prim, u, v, 0.0f, 0.5f, - !(kernel_tex_fetch(__object_flag, object) & SD_TRANSFORM_APPLIED), + !(kernel_tex_fetch(__object_flag, object) & SD_OBJECT_TRANSFORM_APPLIED), LAMP_NONE); } @@ -382,38 +386,37 @@ ccl_device void shader_setup_from_displace(KernelGlobals *kg, ShaderData *sd, ccl_device_inline void shader_setup_from_background(KernelGlobals *kg, ShaderData *sd, const Ray *ray) { /* vectors */ - ccl_fetch(sd, P) = ray->D; - ccl_fetch(sd, N) = -ray->D; - ccl_fetch(sd, Ng) = -ray->D; - ccl_fetch(sd, I) = -ray->D; - 
ccl_fetch(sd, shader) = kernel_data.background.surface_shader; - ccl_fetch(sd, flag) = kernel_tex_fetch(__shader_flag, (ccl_fetch(sd, shader) & SHADER_MASK)*SHADER_SIZE); -#ifdef __OBJECT_MOTION__ - ccl_fetch(sd, time) = ray->time; -#endif - ccl_fetch(sd, ray_length) = 0.0f; + sd->P = ray->D; + sd->N = -ray->D; + sd->Ng = -ray->D; + sd->I = -ray->D; + sd->shader = kernel_data.background.surface_shader; + sd->flag = kernel_tex_fetch(__shader_flag, (sd->shader & SHADER_MASK)*SHADER_SIZE); + sd->object_flag = 0; + sd->time = ray->time; + sd->ray_length = 0.0f; #ifdef __INSTANCING__ - ccl_fetch(sd, object) = PRIM_NONE; + sd->object = PRIM_NONE; #endif - ccl_fetch(sd, prim) = PRIM_NONE; + sd->prim = PRIM_NONE; #ifdef __UV__ - ccl_fetch(sd, u) = 0.0f; - ccl_fetch(sd, v) = 0.0f; + sd->u = 0.0f; + sd->v = 0.0f; #endif #ifdef __DPDU__ /* dPdu/dPdv */ - ccl_fetch(sd, dPdu) = make_float3(0.0f, 0.0f, 0.0f); - ccl_fetch(sd, dPdv) = make_float3(0.0f, 0.0f, 0.0f); + sd->dPdu = make_float3(0.0f, 0.0f, 0.0f); + sd->dPdv = make_float3(0.0f, 0.0f, 0.0f); #endif #ifdef __RAY_DIFFERENTIALS__ /* differentials */ - ccl_fetch(sd, dP) = ray->dD; - differential_incoming(&ccl_fetch(sd, dI), ccl_fetch(sd, dP)); - ccl_fetch(sd, du) = differential_zero(); - ccl_fetch(sd, dv) = differential_zero(); + sd->dP = ray->dD; + differential_incoming(&sd->dI, sd->dP); + sd->du = differential_zero(); + sd->dv = differential_zero(); #endif } @@ -429,9 +432,8 @@ ccl_device_inline void shader_setup_from_volume(KernelGlobals *kg, ShaderData *s sd->I = -ray->D; sd->shader = SHADER_NONE; sd->flag = 0; -#ifdef __OBJECT_MOTION__ + sd->object_flag = 0; sd->time = ray->time; -#endif sd->ray_length = 0.0f; /* todo: can we set this to some useful value? */ #ifdef __INSTANCING__ @@ -500,25 +502,50 @@ ccl_device_inline void shader_merge_closures(ShaderData *sd) } #endif +/* Defensive sampling. 
*/ + +ccl_device_inline void shader_prepare_closures(ShaderData *sd, + ccl_addr_space PathState *state) +{ + /* We can likely also do defensive sampling at deeper bounces, particularly + * for cases like a perfect mirror but possibly also others. This will need + * a good heuristic. */ + if(state->bounce + state->transparent_bounce == 0 && sd->num_closure > 1) { + float sum = 0.0f; + + for(int i = 0; i < sd->num_closure; i++) { + ShaderClosure *sc = &sd->closure[i]; + if(CLOSURE_IS_BSDF_OR_BSSRDF(sc->type)) { + sum += sc->sample_weight; + } + } + + for(int i = 0; i < sd->num_closure; i++) { + ShaderClosure *sc = &sd->closure[i]; + if(CLOSURE_IS_BSDF_OR_BSSRDF(sc->type)) { + sc->sample_weight = max(sc->sample_weight, 0.125f * sum); + } + } + } +} + + /* BSDF */ ccl_device_inline void _shader_bsdf_multi_eval(KernelGlobals *kg, ShaderData *sd, const float3 omega_in, float *pdf, - int skip_bsdf, BsdfEval *result_eval, float sum_pdf, float sum_sample_weight) + const ShaderClosure *skip_sc, BsdfEval *result_eval, float sum_pdf, float sum_sample_weight) { /* this is the veach one-sample model with balance heuristic, some pdf * factors drop out when using balance heuristic weighting */ - for(int i = 0; i < ccl_fetch(sd, num_closure); i++) { - if(i == skip_bsdf) - continue; - - const ShaderClosure *sc = ccl_fetch_array(sd, closure, i); + for(int i = 0; i < sd->num_closure; i++) { + const ShaderClosure *sc = &sd->closure[i]; - if(CLOSURE_IS_BSDF(sc->type)) { + if(sc != skip_sc && CLOSURE_IS_BSDF(sc->type)) { float bsdf_pdf = 0.0f; float3 eval = bsdf_eval(kg, sd, sc, omega_in, &bsdf_pdf); if(bsdf_pdf != 0.0f) { - bsdf_eval_accum(result_eval, sc->type, eval*sc->weight); + bsdf_eval_accum(result_eval, sc->type, eval*sc->weight, 1.0f); sum_pdf += bsdf_pdf*sc->sample_weight; } @@ -537,8 +564,8 @@ ccl_device_inline void _shader_bsdf_multi_eval_branched(KernelGlobals *kg, float light_pdf, bool use_mis) { - for(int i = 0; i < ccl_fetch(sd, num_closure); i++) { - const ShaderClosure 
*sc = ccl_fetch_array(sd, closure, i); + for(int i = 0; i < sd->num_closure; i++) { + const ShaderClosure *sc = &sd->closure[i]; if(CLOSURE_IS_BSDF(sc->type)) { float bsdf_pdf = 0.0f; float3 eval = bsdf_eval(kg, sd, sc, omega_in, &bsdf_pdf); @@ -546,7 +573,8 @@ ccl_device_inline void _shader_bsdf_multi_eval_branched(KernelGlobals *kg, float mis_weight = use_mis? power_heuristic(light_pdf, bsdf_pdf): 1.0f; bsdf_eval_accum(result_eval, sc->type, - eval * sc->weight * mis_weight); + eval * sc->weight, + mis_weight); } } } @@ -575,56 +603,128 @@ void shader_bsdf_eval(KernelGlobals *kg, #endif { float pdf; - _shader_bsdf_multi_eval(kg, sd, omega_in, &pdf, -1, eval, 0.0f, 0.0f); + _shader_bsdf_multi_eval(kg, sd, omega_in, &pdf, NULL, eval, 0.0f, 0.0f); if(use_mis) { float weight = power_heuristic(light_pdf, pdf); - bsdf_eval_mul(eval, weight); + bsdf_eval_mis(eval, weight); } } } -ccl_device_inline int shader_bsdf_sample(KernelGlobals *kg, - ShaderData *sd, - float randu, float randv, - BsdfEval *bsdf_eval, - float3 *omega_in, - differential3 *domega_in, - float *pdf) +ccl_device_inline const ShaderClosure *shader_bsdf_pick(ShaderData *sd, + float *randu) { int sampled = 0; - if(ccl_fetch(sd, num_closure) > 1) { - /* pick a BSDF closure based on sample weights */ + if(sd->num_closure > 1) { + /* Pick a BSDF or based on sample weights. 
*/ float sum = 0.0f; - for(sampled = 0; sampled < ccl_fetch(sd, num_closure); sampled++) { - const ShaderClosure *sc = ccl_fetch_array(sd, closure, sampled); - - if(CLOSURE_IS_BSDF(sc->type)) + for(int i = 0; i < sd->num_closure; i++) { + const ShaderClosure *sc = &sd->closure[i]; + + if(CLOSURE_IS_BSDF(sc->type)) { sum += sc->sample_weight; + } } - float r = ccl_fetch(sd, randb_closure)*sum; - sum = 0.0f; + float r = (*randu)*sum; + float partial_sum = 0.0f; + + for(int i = 0; i < sd->num_closure; i++) { + const ShaderClosure *sc = &sd->closure[i]; - for(sampled = 0; sampled < ccl_fetch(sd, num_closure); sampled++) { - const ShaderClosure *sc = ccl_fetch_array(sd, closure, sampled); - if(CLOSURE_IS_BSDF(sc->type)) { - sum += sc->sample_weight; + float next_sum = partial_sum + sc->sample_weight; + + if(r < next_sum) { + sampled = i; - if(r <= sum) + /* Rescale to reuse for direction sample, to better + * preserve stratifaction. */ + *randu = (r - partial_sum) / sc->sample_weight; break; + } + + partial_sum = next_sum; } } + } - if(sampled == ccl_fetch(sd, num_closure)) { - *pdf = 0.0f; - return LABEL_NONE; + return &sd->closure[sampled]; +} + +ccl_device_inline const ShaderClosure *shader_bssrdf_pick(ShaderData *sd, + ccl_addr_space float3 *throughput, + float *randu) +{ + int sampled = 0; + + if(sd->num_closure > 1) { + /* Pick a BSDF or BSSRDF or based on sample weights. 
*/ + float sum_bsdf = 0.0f; + float sum_bssrdf = 0.0f; + + for(int i = 0; i < sd->num_closure; i++) { + const ShaderClosure *sc = &sd->closure[i]; + + if(CLOSURE_IS_BSDF(sc->type)) { + sum_bsdf += sc->sample_weight; + } + else if(CLOSURE_IS_BSSRDF(sc->type)) { + sum_bssrdf += sc->sample_weight; + } } + + float r = (*randu)*(sum_bsdf + sum_bssrdf); + float partial_sum = 0.0f; + + for(int i = 0; i < sd->num_closure; i++) { + const ShaderClosure *sc = &sd->closure[i]; + + if(CLOSURE_IS_BSDF_OR_BSSRDF(sc->type)) { + float next_sum = partial_sum + sc->sample_weight; + + if(r < next_sum) { + if(CLOSURE_IS_BSDF(sc->type)) { + *throughput *= (sum_bsdf + sum_bssrdf) / sum_bsdf; + return NULL; + } + else { + *throughput *= (sum_bsdf + sum_bssrdf) / sum_bssrdf; + sampled = i; + + /* Rescale to reuse for direction sample, to better + * preserve stratifaction. */ + *randu = (r - partial_sum) / sc->sample_weight; + break; + } + } + + partial_sum = next_sum; + } + } + } + + return &sd->closure[sampled]; +} + +ccl_device_inline int shader_bsdf_sample(KernelGlobals *kg, + ShaderData *sd, + float randu, float randv, + BsdfEval *bsdf_eval, + float3 *omega_in, + differential3 *domega_in, + float *pdf) +{ + const ShaderClosure *sc = shader_bsdf_pick(sd, &randu); + if(sc == NULL) { + *pdf = 0.0f; + return LABEL_NONE; } - const ShaderClosure *sc = ccl_fetch_array(sd, closure, sampled); + /* BSSRDF should already have been handled elsewhere. 
*/ + kernel_assert(CLOSURE_IS_BSDF(sc->type)); int label; float3 eval; @@ -635,9 +735,9 @@ ccl_device_inline int shader_bsdf_sample(KernelGlobals *kg, if(*pdf != 0.0f) { bsdf_eval_init(bsdf_eval, sc->type, eval*sc->weight, kernel_data.film.use_light_pass); - if(ccl_fetch(sd, num_closure) > 1) { + if(sd->num_closure > 1) { float sweight = sc->sample_weight; - _shader_bsdf_multi_eval(kg, sd, *omega_in, pdf, sampled, bsdf_eval, *pdf*sweight, sweight); + _shader_bsdf_multi_eval(kg, sd, *omega_in, pdf, sc, bsdf_eval, *pdf*sweight, sweight); } } @@ -662,23 +762,23 @@ ccl_device int shader_bsdf_sample_closure(KernelGlobals *kg, ShaderData *sd, ccl_device void shader_bsdf_blur(KernelGlobals *kg, ShaderData *sd, float roughness) { - for(int i = 0; i < ccl_fetch(sd, num_closure); i++) { - ShaderClosure *sc = ccl_fetch_array(sd, closure, i); + for(int i = 0; i < sd->num_closure; i++) { + ShaderClosure *sc = &sd->closure[i]; if(CLOSURE_IS_BSDF(sc->type)) bsdf_blur(kg, sc, roughness); } } -ccl_device float3 shader_bsdf_transparency(KernelGlobals *kg, ShaderData *sd) +ccl_device float3 shader_bsdf_transparency(KernelGlobals *kg, const ShaderData *sd) { - if(ccl_fetch(sd, flag) & SD_HAS_ONLY_VOLUME) + if(sd->flag & SD_HAS_ONLY_VOLUME) return make_float3(1.0f, 1.0f, 1.0f); float3 eval = make_float3(0.0f, 0.0f, 0.0f); - for(int i = 0; i < ccl_fetch(sd, num_closure); i++) { - ShaderClosure *sc = ccl_fetch_array(sd, closure, i); + for(int i = 0; i < sd->num_closure; i++) { + const ShaderClosure *sc = &sd->closure[i]; if(sc->type == CLOSURE_BSDF_TRANSPARENT_ID) // todo: make this work for osl eval += sc->weight; @@ -687,6 +787,18 @@ ccl_device float3 shader_bsdf_transparency(KernelGlobals *kg, ShaderData *sd) return eval; } +ccl_device void shader_bsdf_disable_transparency(KernelGlobals *kg, ShaderData *sd) +{ + for(int i = 0; i < sd->num_closure; i++) { + ShaderClosure *sc = &sd->closure[i]; + + if(sc->type == CLOSURE_BSDF_TRANSPARENT_ID) { + sc->sample_weight = 0.0f; + sc->weight = 
make_float3(0.0f, 0.0f, 0.0f); + } + } +} + ccl_device float3 shader_bsdf_alpha(KernelGlobals *kg, ShaderData *sd) { float3 alpha = make_float3(1.0f, 1.0f, 1.0f) - shader_bsdf_transparency(kg, sd); @@ -701,8 +813,8 @@ ccl_device float3 shader_bsdf_diffuse(KernelGlobals *kg, ShaderData *sd) { float3 eval = make_float3(0.0f, 0.0f, 0.0f); - for(int i = 0; i < ccl_fetch(sd, num_closure); i++) { - ShaderClosure *sc = ccl_fetch_array(sd, closure, i); + for(int i = 0; i < sd->num_closure; i++) { + ShaderClosure *sc = &sd->closure[i]; if(CLOSURE_IS_BSDF_DIFFUSE(sc->type)) eval += sc->weight; @@ -715,8 +827,8 @@ ccl_device float3 shader_bsdf_glossy(KernelGlobals *kg, ShaderData *sd) { float3 eval = make_float3(0.0f, 0.0f, 0.0f); - for(int i = 0; i < ccl_fetch(sd, num_closure); i++) { - ShaderClosure *sc = ccl_fetch_array(sd, closure, i); + for(int i = 0; i < sd->num_closure; i++) { + ShaderClosure *sc = &sd->closure[i]; if(CLOSURE_IS_BSDF_GLOSSY(sc->type)) eval += sc->weight; @@ -729,8 +841,8 @@ ccl_device float3 shader_bsdf_transmission(KernelGlobals *kg, ShaderData *sd) { float3 eval = make_float3(0.0f, 0.0f, 0.0f); - for(int i = 0; i < ccl_fetch(sd, num_closure); i++) { - ShaderClosure *sc = ccl_fetch_array(sd, closure, i); + for(int i = 0; i < sd->num_closure; i++) { + ShaderClosure *sc = &sd->closure[i]; if(CLOSURE_IS_BSDF_TRANSMISSION(sc->type)) eval += sc->weight; @@ -743,8 +855,8 @@ ccl_device float3 shader_bsdf_subsurface(KernelGlobals *kg, ShaderData *sd) { float3 eval = make_float3(0.0f, 0.0f, 0.0f); - for(int i = 0; i < ccl_fetch(sd, num_closure); i++) { - ShaderClosure *sc = ccl_fetch_array(sd, closure, i); + for(int i = 0; i < sd->num_closure; i++) { + ShaderClosure *sc = &sd->closure[i]; if(CLOSURE_IS_BSSRDF(sc->type) || CLOSURE_IS_BSDF_BSSRDF(sc->type)) eval += sc->weight; @@ -753,13 +865,26 @@ ccl_device float3 shader_bsdf_subsurface(KernelGlobals *kg, ShaderData *sd) return eval; } +ccl_device float3 shader_bsdf_average_normal(KernelGlobals *kg, ShaderData 
*sd) +{ + float3 N = make_float3(0.0f, 0.0f, 0.0f); + + for(int i = 0; i < sd->num_closure; i++) { + ShaderClosure *sc = &sd->closure[i]; + if(CLOSURE_IS_BSDF_OR_BSSRDF(sc->type)) + N += sc->N*average(sc->weight); + } + + return (is_zero(N))? sd->N : normalize(N); +} + ccl_device float3 shader_bsdf_ao(KernelGlobals *kg, ShaderData *sd, float ao_factor, float3 *N_) { float3 eval = make_float3(0.0f, 0.0f, 0.0f); float3 N = make_float3(0.0f, 0.0f, 0.0f); - for(int i = 0; i < ccl_fetch(sd, num_closure); i++) { - ShaderClosure *sc = ccl_fetch_array(sd, closure, i); + for(int i = 0; i < sd->num_closure; i++) { + ShaderClosure *sc = &sd->closure[i]; if(CLOSURE_IS_BSDF_DIFFUSE(sc->type)) { const DiffuseBsdf *bsdf = (const DiffuseBsdf*)sc; @@ -768,16 +893,11 @@ ccl_device float3 shader_bsdf_ao(KernelGlobals *kg, ShaderData *sd, float ao_fac } else if(CLOSURE_IS_AMBIENT_OCCLUSION(sc->type)) { eval += sc->weight; - N += ccl_fetch(sd, N)*average(sc->weight); + N += sd->N*average(sc->weight); } } - if(is_zero(N)) - N = ccl_fetch(sd, N); - else - N = normalize(N); - - *N_ = N; + *N_ = (is_zero(N))? sd->N : normalize(N); return eval; } @@ -788,8 +908,8 @@ ccl_device float3 shader_bssrdf_sum(ShaderData *sd, float3 *N_, float *texture_b float3 N = make_float3(0.0f, 0.0f, 0.0f); float texture_blur = 0.0f, weight_sum = 0.0f; - for(int i = 0; i < ccl_fetch(sd, num_closure); i++) { - ShaderClosure *sc = ccl_fetch_array(sd, closure, i); + for(int i = 0; i < sd->num_closure; i++) { + ShaderClosure *sc = &sd->closure[i]; if(CLOSURE_IS_BSSRDF(sc->type)) { const Bssrdf *bssrdf = (const Bssrdf*)sc; @@ -803,10 +923,10 @@ ccl_device float3 shader_bssrdf_sum(ShaderData *sd, float3 *N_, float *texture_b } if(N_) - *N_ = (is_zero(N))? ccl_fetch(sd, N): normalize(N); + *N_ = (is_zero(N))? 
sd->N: normalize(N); if(texture_blur_) - *texture_blur_ = texture_blur/weight_sum; + *texture_blur_ = safe_divide(texture_blur, weight_sum); return eval; } @@ -816,7 +936,7 @@ ccl_device float3 shader_bssrdf_sum(ShaderData *sd, float3 *N_, float *texture_b ccl_device float3 emissive_eval(KernelGlobals *kg, ShaderData *sd, ShaderClosure *sc) { - return emissive_simple_eval(ccl_fetch(sd, Ng), ccl_fetch(sd, I)); + return emissive_simple_eval(sd->Ng, sd->I); } ccl_device float3 shader_emissive_eval(KernelGlobals *kg, ShaderData *sd) @@ -824,8 +944,8 @@ ccl_device float3 shader_emissive_eval(KernelGlobals *kg, ShaderData *sd) float3 eval; eval = make_float3(0.0f, 0.0f, 0.0f); - for(int i = 0; i < ccl_fetch(sd, num_closure); i++) { - ShaderClosure *sc = ccl_fetch_array(sd, closure, i); + for(int i = 0; i < sd->num_closure; i++) { + ShaderClosure *sc = &sd->closure[i]; if(CLOSURE_IS_EMISSION(sc->type)) eval += emissive_eval(kg, sd, sc)*sc->weight; @@ -840,8 +960,8 @@ ccl_device float3 shader_holdout_eval(KernelGlobals *kg, ShaderData *sd) { float3 weight = make_float3(0.0f, 0.0f, 0.0f); - for(int i = 0; i < ccl_fetch(sd, num_closure); i++) { - ShaderClosure *sc = ccl_fetch_array(sd, closure, i); + for(int i = 0; i < sd->num_closure; i++) { + ShaderClosure *sc = &sd->closure[i]; if(CLOSURE_IS_HOLDOUT(sc->type)) weight += sc->weight; @@ -852,16 +972,15 @@ ccl_device float3 shader_holdout_eval(KernelGlobals *kg, ShaderData *sd) /* Surface Evaluation */ -ccl_device void shader_eval_surface(KernelGlobals *kg, ShaderData *sd, ccl_addr_space RNG *rng, - ccl_addr_space PathState *state, float randb, int path_flag, ShaderContext ctx) +ccl_device void shader_eval_surface(KernelGlobals *kg, ShaderData *sd, + ccl_addr_space PathState *state, int path_flag) { - ccl_fetch(sd, num_closure) = 0; - ccl_fetch(sd, num_closure_extra) = 0; - ccl_fetch(sd, randb_closure) = randb; + sd->num_closure = 0; + sd->num_closure_extra = 0; #ifdef __OSL__ if(kg->osl) - OSLShader::eval_surface(kg, sd, 
state, path_flag, ctx); + OSLShader::eval_surface(kg, sd, state, path_flag); else #endif { @@ -871,29 +990,28 @@ ccl_device void shader_eval_surface(KernelGlobals *kg, ShaderData *sd, ccl_addr_ DiffuseBsdf *bsdf = (DiffuseBsdf*)bsdf_alloc(sd, sizeof(DiffuseBsdf), make_float3(0.8f, 0.8f, 0.8f)); - bsdf->N = ccl_fetch(sd, N); - ccl_fetch(sd, flag) |= bsdf_diffuse_setup(bsdf); + bsdf->N = sd->N; + sd->flag |= bsdf_diffuse_setup(bsdf); #endif } - if(rng && (ccl_fetch(sd, flag) & SD_BSDF_NEEDS_LCG)) { - ccl_fetch(sd, lcg_state) = lcg_state_init_addrspace(rng, state, 0xb4bc3953); + if(sd->flag & SD_BSDF_NEEDS_LCG) { + sd->lcg_state = lcg_state_init_addrspace(state, 0xb4bc3953); } } /* Background Evaluation */ ccl_device float3 shader_eval_background(KernelGlobals *kg, ShaderData *sd, - ccl_addr_space PathState *state, int path_flag, ShaderContext ctx) + ccl_addr_space PathState *state, int path_flag) { - ccl_fetch(sd, num_closure) = 0; - ccl_fetch(sd, num_closure_extra) = 0; - ccl_fetch(sd, randb_closure) = 0.0f; + sd->num_closure = 0; + sd->num_closure_extra = 0; #ifdef __SVM__ #ifdef __OSL__ if(kg->osl) { - OSLShader::eval_background(kg, sd, state, path_flag, ctx); + OSLShader::eval_background(kg, sd, state, path_flag); } else #endif @@ -903,8 +1021,8 @@ ccl_device float3 shader_eval_background(KernelGlobals *kg, ShaderData *sd, float3 eval = make_float3(0.0f, 0.0f, 0.0f); - for(int i = 0; i < ccl_fetch(sd, num_closure); i++) { - const ShaderClosure *sc = ccl_fetch_array(sd, closure, i); + for(int i = 0; i < sd->num_closure; i++) { + const ShaderClosure *sc = &sd->closure[i]; if(CLOSURE_IS_BACKGROUND(sc->type)) eval += sc->weight; @@ -934,7 +1052,7 @@ ccl_device_inline void _shader_volume_phase_multi_eval(const ShaderData *sd, con float3 eval = volume_phase_eval(sd, sc, omega_in, &phase_pdf); if(phase_pdf != 0.0f) { - bsdf_eval_accum(result_eval, sc->type, eval); + bsdf_eval_accum(result_eval, sc->type, eval, 1.0f); sum_pdf += phase_pdf*sc->sample_weight; } @@ -970,17 
+1088,22 @@ ccl_device int shader_volume_phase_sample(KernelGlobals *kg, const ShaderData *s sum += sc->sample_weight; } - float r = sd->randb_closure*sum; - sum = 0.0f; + float r = randu*sum; + float partial_sum = 0.0f; for(sampled = 0; sampled < sd->num_closure; sampled++) { const ShaderClosure *sc = &sd->closure[sampled]; if(CLOSURE_IS_PHASE(sc->type)) { - sum += sc->sample_weight; + float next_sum = partial_sum + sc->sample_weight; - if(r <= sum) + if(r <= next_sum) { + /* Rescale to reuse for BSDF direction sample. */ + randu = (r - partial_sum) / sc->sample_weight; break; + } + + partial_sum = next_sum; } } @@ -1026,16 +1149,16 @@ ccl_device int shader_phase_sample_closure(KernelGlobals *kg, const ShaderData * ccl_device_inline void shader_eval_volume(KernelGlobals *kg, ShaderData *sd, - PathState *state, - VolumeStack *stack, - int path_flag, - ShaderContext ctx) + ccl_addr_space PathState *state, + ccl_addr_space VolumeStack *stack, + int path_flag) { /* reset closures once at the start, we will be accumulating the closures * for all volumes in the stack into a single array of closures */ sd->num_closure = 0; sd->num_closure_extra = 0; sd->flag = 0; + sd->object_flag = 0; for(int i = 0; stack[i].shader != SHADER_NONE; i++) { /* setup shaderdata from stack. 
it's mostly setup already in @@ -1043,11 +1166,12 @@ ccl_device_inline void shader_eval_volume(KernelGlobals *kg, sd->object = stack[i].object; sd->shader = stack[i].shader; - sd->flag &= ~(SD_SHADER_FLAGS|SD_OBJECT_FLAGS); + sd->flag &= ~SD_SHADER_FLAGS; sd->flag |= kernel_tex_fetch(__shader_flag, (sd->shader & SHADER_MASK)*SHADER_SIZE); + sd->object_flag &= ~SD_OBJECT_FLAGS; if(sd->object != OBJECT_NONE) { - sd->flag |= kernel_tex_fetch(__object_flag, sd->object); + sd->object_flag |= kernel_tex_fetch(__object_flag, sd->object); #ifdef __OBJECT_MOTION__ /* todo: this is inefficient for motion blur, we should be @@ -1060,7 +1184,7 @@ ccl_device_inline void shader_eval_volume(KernelGlobals *kg, #ifdef __SVM__ # ifdef __OSL__ if(kg->osl) { - OSLShader::eval_volume(kg, sd, state, path_flag, ctx); + OSLShader::eval_volume(kg, sd, state, path_flag); } else # endif @@ -1079,17 +1203,16 @@ ccl_device_inline void shader_eval_volume(KernelGlobals *kg, /* Displacement Evaluation */ -ccl_device void shader_eval_displacement(KernelGlobals *kg, ShaderData *sd, ccl_addr_space PathState *state, ShaderContext ctx) +ccl_device void shader_eval_displacement(KernelGlobals *kg, ShaderData *sd, ccl_addr_space PathState *state) { - ccl_fetch(sd, num_closure) = 0; - ccl_fetch(sd, num_closure_extra) = 0; - ccl_fetch(sd, randb_closure) = 0.0f; + sd->num_closure = 0; + sd->num_closure_extra = 0; /* this will modify sd->P */ #ifdef __SVM__ # ifdef __OSL__ if(kg->osl) - OSLShader::eval_displacement(kg, sd, ctx); + OSLShader::eval_displacement(kg, sd, state); else # endif { diff --git a/intern/cycles/kernel/kernel_shadow.h b/intern/cycles/kernel/kernel_shadow.h index 2981f6ac566..8a0da6c3b13 100644 --- a/intern/cycles/kernel/kernel_shadow.h +++ b/intern/cycles/kernel/kernel_shadow.h @@ -16,9 +16,118 @@ CCL_NAMESPACE_BEGIN -#ifdef __SHADOW_RECORD_ALL__ +#ifdef __VOLUME__ +typedef struct VolumeState { +# ifdef __SPLIT_KERNEL__ +# else + PathState ps; +# endif +} VolumeState; + +/* Get PathState 
ready for use for volume stack evaluation. */ +# ifdef __SPLIT_KERNEL__ +ccl_addr_space +# endif +ccl_device_inline PathState *shadow_blocked_volume_path_state( + KernelGlobals *kg, + VolumeState *volume_state, + ccl_addr_space PathState *state, + ShaderData *sd, + Ray *ray) +{ +# ifdef __SPLIT_KERNEL__ + ccl_addr_space PathState *ps = + &kernel_split_state.state_shadow[ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0)]; +# else + PathState *ps = &volume_state->ps; +# endif + *ps = *state; + /* We are checking for shadow on the "other" side of the surface, so need + * to discard volume we are currently at. + */ + if(dot(sd->Ng, ray->D) < 0.0f) { + kernel_volume_stack_enter_exit(kg, sd, ps->volume_stack); + } + return ps; +} +#endif /* __VOLUME__ */ + +/* Attenuate throughput accordingly to the given intersection event. + * Returns true if the throughput is zero and traversal can be aborted. + */ +ccl_device_forceinline bool shadow_handle_transparent_isect( + KernelGlobals *kg, + ShaderData *shadow_sd, + ccl_addr_space PathState *state, +# ifdef __VOLUME__ + ccl_addr_space struct PathState *volume_state, +# endif + Intersection *isect, + Ray *ray, + float3 *throughput) +{ +#ifdef __VOLUME__ + /* Attenuation between last surface and next surface. */ + if(volume_state->volume_stack[0].shader != SHADER_NONE) { + Ray segment_ray = *ray; + segment_ray.t = isect->t; + kernel_volume_shadow(kg, + shadow_sd, + volume_state, + &segment_ray, + throughput); + } +#endif + /* Setup shader data at surface. */ + shader_setup_from_ray(kg, shadow_sd, isect, ray); + /* Attenuation from transparent surface. */ + if(!(shadow_sd->flag & SD_HAS_ONLY_VOLUME)) { + path_state_modify_bounce(state, true); + shader_eval_surface(kg, + shadow_sd, + state, + PATH_RAY_SHADOW); + path_state_modify_bounce(state, false); + *throughput *= shader_bsdf_transparency(kg, shadow_sd); + } + /* Stop if all light is blocked. 
*/ + if(is_zero(*throughput)) { + return true; + } +#ifdef __VOLUME__ + /* Exit/enter volume. */ + kernel_volume_stack_enter_exit(kg, shadow_sd, volume_state->volume_stack); +#endif + return false; +} + +/* Special version which only handles opaque shadows. */ +ccl_device bool shadow_blocked_opaque(KernelGlobals *kg, + ShaderData *shadow_sd, + ccl_addr_space PathState *state, + const uint visibility, + Ray *ray, + Intersection *isect, + float3 *shadow) +{ + const bool blocked = scene_intersect(kg, + *ray, + visibility & PATH_RAY_SHADOW_OPAQUE, + isect, + NULL, + 0.0f, 0.0f); +#ifdef __VOLUME__ + if(!blocked && state->volume_stack[0].shader != SHADER_NONE) { + /* Apply attenuation from current volume shader. */ + kernel_volume_shadow(kg, shadow_sd, state, ray, shadow); + } +#endif + return blocked; +} -/* Shadow function to compute how much light is blocked, CPU variation. +#ifdef __TRANSPARENT_SHADOWS__ +# ifdef __SHADOW_RECORD_ALL__ +/* Shadow function to compute how much light is blocked, * * We trace a single ray. If it hits any opaque surface, or more than a given * number of transparent surfaces is hit, then we consider the geometry to be @@ -36,261 +145,412 @@ CCL_NAMESPACE_BEGIN * or there is a performance increase anyway due to avoiding the need to send * two rays with transparent shadows. * - * This is CPU only because of qsort, and malloc or high stack space usage to - * record all these intersections. */ + * On CPU it'll handle all transparent bounces (by allocating storage for + * intersections when they don't fit into the stack storage). + * + * On GPU it'll only handle SHADOW_STACK_MAX_HITS-1 intersections, so this + * is something to be kept an eye on. + */ -#define STACK_MAX_HITS 64 +# define SHADOW_STACK_MAX_HITS 64 -ccl_device_inline bool shadow_blocked(KernelGlobals *kg, ShaderData *shadow_sd, PathState *state, Ray *ray, float3 *shadow) +/* Actual logic with traversal loop implementation which is free from device + * specific tweaks. 
+ * + * Note that hits array should be as big as max_hits+1. + */ +ccl_device bool shadow_blocked_transparent_all_loop(KernelGlobals *kg, + ShaderData *sd, + ShaderData *shadow_sd, + ccl_addr_space PathState *state, + const uint visibility, + Ray *ray, + Intersection *hits, + uint max_hits, + float3 *shadow) { - *shadow = make_float3(1.0f, 1.0f, 1.0f); - - if(ray->t == 0.0f) - return false; - - bool blocked; - - if(kernel_data.integrator.transparent_shadows) { - /* check transparent bounces here, for volume scatter which can do - * lighting before surface path termination is checked */ - if(state->transparent_bounce >= kernel_data.integrator.transparent_max_bounce) - return true; - - /* intersect to find an opaque surface, or record all transparent surface hits */ - Intersection hits_stack[STACK_MAX_HITS]; - Intersection *hits = hits_stack; - const int transparent_max_bounce = kernel_data.integrator.transparent_max_bounce; - uint max_hits = transparent_max_bounce - state->transparent_bounce - 1; - - /* prefer to use stack but use dynamic allocation if too deep max hits - * we need max_hits + 1 storage space due to the logic in - * scene_intersect_shadow_all which will first store and then check if - * the limit is exceeded */ - if(max_hits + 1 > STACK_MAX_HITS) { - if(kg->transparent_shadow_intersections == NULL) { - kg->transparent_shadow_intersections = - (Intersection*)malloc(sizeof(Intersection)*(transparent_max_bounce + 1)); + /* Intersect to find an opaque surface, or record all transparent + * surface hits. + */ + uint num_hits; + const bool blocked = scene_intersect_shadow_all(kg, + ray, + hits, + visibility, + max_hits, + &num_hits); +# ifdef __VOLUME__ + VolumeState volume_state; +# endif + /* If no opaque surface found but we did find transparent hits, + * shade them. 
+ */ + if(!blocked && num_hits > 0) { + float3 throughput = make_float3(1.0f, 1.0f, 1.0f); + float3 Pend = ray->P + ray->D*ray->t; + float last_t = 0.0f; + int bounce = state->transparent_bounce; + Intersection *isect = hits; +# ifdef __VOLUME__ +# ifdef __SPLIT_KERNEL__ + ccl_addr_space +# endif + PathState *ps = shadow_blocked_volume_path_state(kg, + &volume_state, + state, + sd, + ray); +# endif + sort_intersections(hits, num_hits); + for(int hit = 0; hit < num_hits; hit++, isect++) { + /* Adjust intersection distance for moving ray forward. */ + float new_t = isect->t; + isect->t -= last_t; + /* Skip hit if we did not move forward, step by step raytracing + * would have skipped it as well then. + */ + if(last_t == new_t) { + continue; } - hits = kg->transparent_shadow_intersections; - } - - uint num_hits; - blocked = scene_intersect_shadow_all(kg, ray, hits, max_hits, &num_hits); - - /* if no opaque surface found but we did find transparent hits, shade them */ - if(!blocked && num_hits > 0) { - float3 throughput = make_float3(1.0f, 1.0f, 1.0f); - float3 Pend = ray->P + ray->D*ray->t; - float last_t = 0.0f; - int bounce = state->transparent_bounce; - Intersection *isect = hits; -#ifdef __VOLUME__ - PathState ps = *state; -#endif - - qsort(hits, num_hits, sizeof(Intersection), intersections_compare); - - for(int hit = 0; hit < num_hits; hit++, isect++) { - /* adjust intersection distance for moving ray forward */ - float new_t = isect->t; - isect->t -= last_t; - - /* skip hit if we did not move forward, step by step raytracing - * would have skipped it as well then */ - if(last_t == new_t) - continue; - - last_t = new_t; - -#ifdef __VOLUME__ - /* attenuation between last surface and next surface */ - if(ps.volume_stack[0].shader != SHADER_NONE) { - Ray segment_ray = *ray; - segment_ray.t = isect->t; - kernel_volume_shadow(kg, shadow_sd, &ps, &segment_ray, &throughput); - } -#endif - - /* setup shader data at surface */ - shader_setup_from_ray(kg, shadow_sd, 
isect, ray); - - /* attenuation from transparent surface */ - if(!(shadow_sd->flag & SD_HAS_ONLY_VOLUME)) { - path_state_modify_bounce(state, true); - shader_eval_surface(kg, shadow_sd, NULL, state, 0.0f, PATH_RAY_SHADOW, SHADER_CONTEXT_SHADOW); - path_state_modify_bounce(state, false); - - throughput *= shader_bsdf_transparency(kg, shadow_sd); - } - - /* stop if all light is blocked */ - if(is_zero(throughput)) { - return true; - } - - /* move ray forward */ - ray->P = shadow_sd->P; - if(ray->t != FLT_MAX) { - ray->D = normalize_len(Pend - ray->P, &ray->t); - } - + last_t = new_t; + /* Attenuate the throughput. */ + if(shadow_handle_transparent_isect(kg, + shadow_sd, + state, #ifdef __VOLUME__ - /* exit/enter volume */ - kernel_volume_stack_enter_exit(kg, shadow_sd, ps.volume_stack); + ps, #endif - - bounce++; + isect, + ray, + &throughput)) + { + return true; } - -#ifdef __VOLUME__ - /* attenuation for last line segment towards light */ - if(ps.volume_stack[0].shader != SHADER_NONE) - kernel_volume_shadow(kg, shadow_sd, &ps, ray, &throughput); -#endif - - *shadow = throughput; - - return is_zero(throughput); + /* Move ray forward. */ + ray->P = shadow_sd->P; + if(ray->t != FLT_MAX) { + ray->D = normalize_len(Pend - ray->P, &ray->t); + } + bounce++; } +# ifdef __VOLUME__ + /* Attenuation for last line segment towards light. */ + if(ps->volume_stack[0].shader != SHADER_NONE) { + kernel_volume_shadow(kg, shadow_sd, ps, ray, &throughput); + } +# endif + *shadow = throughput; + return is_zero(throughput); } - else { - Intersection isect; - blocked = scene_intersect(kg, *ray, PATH_RAY_SHADOW_OPAQUE, &isect, NULL, 0.0f, 0.0f); - } - -#ifdef __VOLUME__ +# ifdef __VOLUME__ if(!blocked && state->volume_stack[0].shader != SHADER_NONE) { - /* apply attenuation from current volume shader */ - kernel_volume_shadow(kg, shadow_sd, state, ray, shadow); + /* Apply attenuation from current volume shader. 
*/ +# ifdef __SPLIT_KERNEL__ + ccl_addr_space +# endif + PathState *ps = shadow_blocked_volume_path_state(kg, + &volume_state, + state, + sd, + ray); + kernel_volume_shadow(kg, shadow_sd, ps, ray, shadow); } -#endif - +# endif return blocked; } -#undef STACK_MAX_HITS - -#else +/* Here we do all device specific trickery before invoking actual traversal + * loop to help readability of the actual logic. + */ +ccl_device bool shadow_blocked_transparent_all(KernelGlobals *kg, + ShaderData *sd, + ShaderData *shadow_sd, + ccl_addr_space PathState *state, + const uint visibility, + Ray *ray, + uint max_hits, + float3 *shadow) +{ +# ifdef __SPLIT_KERNEL__ + Intersection hits_[SHADOW_STACK_MAX_HITS]; + Intersection *hits = &hits_[0]; +# elif defined(__KERNEL_CUDA__) + Intersection *hits = kg->hits_stack; +# else + Intersection hits_stack[SHADOW_STACK_MAX_HITS]; + Intersection *hits = hits_stack; +# endif +# ifndef __KERNEL_GPU__ + /* Prefer to use stack but use dynamic allocation if too deep max hits + * we need max_hits + 1 storage space due to the logic in + * scene_intersect_shadow_all which will first store and then check if + * the limit is exceeded. + * + * Ignore this on GPU because of slow/unavailable malloc(). + */ + if(max_hits + 1 > SHADOW_STACK_MAX_HITS) { + if(kg->transparent_shadow_intersections == NULL) { + const int transparent_max_bounce = kernel_data.integrator.transparent_max_bounce; + kg->transparent_shadow_intersections = + (Intersection*)malloc(sizeof(Intersection)*(transparent_max_bounce + 1)); + } + hits = kg->transparent_shadow_intersections; + } +# endif /* __KERNEL_GPU__ */ + /* Invoke actual traversal. */ + return shadow_blocked_transparent_all_loop(kg, + sd, + shadow_sd, + state, + visibility, + ray, + hits, + max_hits, + shadow); +} +# endif /* __SHADOW_RECORD_ALL__ */ -/* Shadow function to compute how much light is blocked, GPU variation. 
+# if defined(__KERNEL_GPU__) || !defined(__SHADOW_RECORD_ALL__) +/* Shadow function to compute how much light is blocked, * * Here we raytrace from one transparent surface to the next step by step. * To minimize overhead in cases where we don't need transparent shadows, we * first trace a regular shadow ray. We check if the hit primitive was * potentially transparent, and only in that case start marching. this gives - * one extra ray cast for the cases were we do want transparency. */ + * one extra ray cast for the cases were we do want transparency. + */ -ccl_device_noinline bool shadow_blocked(KernelGlobals *kg, - ShaderData *shadow_sd, - ccl_addr_space PathState *state, - ccl_addr_space Ray *ray_input, - float3 *shadow) +/* This function is only implementing device-independent traversal logic + * which requires some precalculation done. + */ +ccl_device bool shadow_blocked_transparent_stepped_loop( + KernelGlobals *kg, + ShaderData *sd, + ShaderData *shadow_sd, + ccl_addr_space PathState *state, + const uint visibility, + Ray *ray, + Intersection *isect, + const bool blocked, + const bool is_transparent_isect, + float3 *shadow) { - *shadow = make_float3(1.0f, 1.0f, 1.0f); - - if(ray_input->t == 0.0f) - return false; - -#ifdef __SPLIT_KERNEL__ - Ray private_ray = *ray_input; - Ray *ray = &private_ray; -#else - Ray *ray = ray_input; -#endif - -#ifdef __SPLIT_KERNEL__ - Intersection *isect = &kg->isect_shadow[SD_THREAD]; -#else - Intersection isect_object; - Intersection *isect = &isect_object; -#endif - - bool blocked = scene_intersect(kg, *ray, PATH_RAY_SHADOW_OPAQUE, isect, NULL, 0.0f, 0.0f); - -#ifdef __TRANSPARENT_SHADOWS__ - if(blocked && kernel_data.integrator.transparent_shadows) { - if(shader_transparent_shadow(kg, isect)) { - float3 throughput = make_float3(1.0f, 1.0f, 1.0f); - float3 Pend = ray->P + ray->D*ray->t; - int bounce = state->transparent_bounce; -#ifdef __VOLUME__ - PathState ps = *state; -#endif - - for(;;) { - if(bounce >= 
kernel_data.integrator.transparent_max_bounce) - return true; - - if(!scene_intersect(kg, *ray, PATH_RAY_SHADOW_TRANSPARENT, isect, NULL, 0.0f, 0.0f)) - { +# ifdef __VOLUME__ + VolumeState volume_state; +# endif + if(blocked && is_transparent_isect) { + float3 throughput = make_float3(1.0f, 1.0f, 1.0f); + float3 Pend = ray->P + ray->D*ray->t; + int bounce = state->transparent_bounce; +# ifdef __VOLUME__ +# ifdef __SPLIT_KERNEL__ + ccl_addr_space +# endif + PathState *ps = shadow_blocked_volume_path_state(kg, + &volume_state, + state, + sd, + ray); +# endif + for(;;) { + if(bounce >= kernel_data.integrator.transparent_max_bounce) { + return true; + } + if(!scene_intersect(kg, + *ray, + visibility & PATH_RAY_SHADOW_TRANSPARENT, + isect, + NULL, + 0.0f, 0.0f)) + { + break; + } + if(!shader_transparent_shadow(kg, isect)) { + return true; + } + /* Attenuate the throughput. */ + if(shadow_handle_transparent_isect(kg, + shadow_sd, + state, #ifdef __VOLUME__ - /* attenuation for last line segment towards light */ - if(ps.volume_stack[0].shader != SHADER_NONE) - kernel_volume_shadow(kg, shadow_sd, &ps, ray, &throughput); + ps, #endif + isect, + ray, + &throughput)) + { + return true; + } + /* Move ray forward. */ + ray->P = ray_offset(shadow_sd->P, -shadow_sd->Ng); + if(ray->t != FLT_MAX) { + ray->D = normalize_len(Pend - ray->P, &ray->t); + } + bounce++; + } +# ifdef __VOLUME__ + /* Attenuation for last line segment towards light. */ + if(ps->volume_stack[0].shader != SHADER_NONE) { + kernel_volume_shadow(kg, shadow_sd, ps, ray, &throughput); + } +# endif + *shadow *= throughput; + return is_zero(throughput); + } +# ifdef __VOLUME__ + if(!blocked && state->volume_stack[0].shader != SHADER_NONE) { + /* Apply attenuation from current volume shader. 
*/ +# ifdef __SPLIT_KERNEL__ + ccl_addr_space +# endif + PathState *ps = shadow_blocked_volume_path_state(kg, + &volume_state, + state, + sd, + ray); + kernel_volume_shadow(kg, shadow_sd, ps, ray, shadow); + } +# endif + return blocked; +} - *shadow *= throughput; - - return false; - } +ccl_device bool shadow_blocked_transparent_stepped( + KernelGlobals *kg, + ShaderData *sd, + ShaderData *shadow_sd, + ccl_addr_space PathState *state, + const uint visibility, + Ray *ray, + Intersection *isect, + float3 *shadow) +{ + bool blocked = scene_intersect(kg, + *ray, + visibility & PATH_RAY_SHADOW_OPAQUE, + isect, + NULL, + 0.0f, 0.0f); + bool is_transparent_isect = blocked + ? shader_transparent_shadow(kg, isect) + : false; + return shadow_blocked_transparent_stepped_loop(kg, + sd, + shadow_sd, + state, + visibility, + ray, + isect, + blocked, + is_transparent_isect, + shadow); +} - if(!shader_transparent_shadow(kg, isect)) { - return true; - } +# endif /* __KERNEL_GPU__ || !__SHADOW_RECORD_ALL__ */ +#endif /* __TRANSPARENT_SHADOWS__ */ -#ifdef __VOLUME__ - /* attenuation between last surface and next surface */ - if(ps.volume_stack[0].shader != SHADER_NONE) { - Ray segment_ray = *ray; - segment_ray.t = isect->t; - kernel_volume_shadow(kg, shadow_sd, &ps, &segment_ray, &throughput); - } +ccl_device_inline bool shadow_blocked(KernelGlobals *kg, + ShaderData *sd, + ShaderData *shadow_sd, + ccl_addr_space PathState *state, + Ray *ray_input, + float3 *shadow) +{ + Ray *ray = ray_input; + Intersection isect; + /* Some common early checks. */ + *shadow = make_float3(1.0f, 1.0f, 1.0f); + if(ray->t == 0.0f) { + return false; + } +#ifdef __SHADOW_TRICKS__ + const uint visibility = (state->flag & PATH_RAY_SHADOW_CATCHER) + ? 
PATH_RAY_SHADOW_NON_CATCHER + : PATH_RAY_SHADOW; +#else + const uint visibility = PATH_RAY_SHADOW; #endif - - /* setup shader data at surface */ - shader_setup_from_ray(kg, shadow_sd, isect, ray); - - /* attenuation from transparent surface */ - if(!(ccl_fetch(shadow_sd, flag) & SD_HAS_ONLY_VOLUME)) { - path_state_modify_bounce(state, true); - shader_eval_surface(kg, shadow_sd, NULL, state, 0.0f, PATH_RAY_SHADOW, SHADER_CONTEXT_SHADOW); - path_state_modify_bounce(state, false); - - throughput *= shader_bsdf_transparency(kg, shadow_sd); - } - - /* stop if all light is blocked */ - if(is_zero(throughput)) { - return true; - } - - /* move ray forward */ - ray->P = ray_offset(ccl_fetch(shadow_sd, P), -ccl_fetch(shadow_sd, Ng)); - if(ray->t != FLT_MAX) { - ray->D = normalize_len(Pend - ray->P, &ray->t); - } - -#ifdef __VOLUME__ - /* exit/enter volume */ - kernel_volume_stack_enter_exit(kg, shadow_sd, ps.volume_stack); + /* Do actual shadow shading. */ + /* First of all, we check if integrator requires transparent shadows. + * if not, we use simplest and fastest ever way to calculate occlusion. + */ +#ifdef __TRANSPARENT_SHADOWS__ + if(!kernel_data.integrator.transparent_shadows) #endif - - bounce++; - } - } + { + return shadow_blocked_opaque(kg, + shadow_sd, + state, + visibility, + ray, + &isect, + shadow); } -#ifdef __VOLUME__ - else if(!blocked && state->volume_stack[0].shader != SHADER_NONE) { - /* apply attenuation from current volume shader */ - kernel_volume_shadow(kg, shadow_sd, state, ray, shadow); +#ifdef __TRANSPARENT_SHADOWS__ +# ifdef __SHADOW_RECORD_ALL__ + /* For the transparent shadows we try to use record-all logic on the + * devices which supports this. + */ + const int transparent_max_bounce = kernel_data.integrator.transparent_max_bounce; + /* Check transparent bounces here, for volume scatter which can do + * lighting before surface path termination is checked. 
+ */ + if(state->transparent_bounce >= transparent_max_bounce) { + return true; } -#endif -#endif - - return blocked; + const uint max_hits = transparent_max_bounce - state->transparent_bounce - 1; +# ifdef __KERNEL_GPU__ + /* On GPU we do trickey with tracing opaque ray first, this avoids speed + * regressions in some files. + * + * TODO(sergey): Check why using record-all behavior causes slowdown in such + * cases. Could that be caused by a higher spill pressure? + */ + const bool blocked = scene_intersect(kg, + *ray, + visibility & PATH_RAY_SHADOW_OPAQUE, + &isect, + NULL, + 0.0f, 0.0f); + const bool is_transparent_isect = blocked + ? shader_transparent_shadow(kg, &isect) + : false; + if(!blocked || !is_transparent_isect || + max_hits + 1 >= SHADOW_STACK_MAX_HITS) + { + return shadow_blocked_transparent_stepped_loop(kg, + sd, + shadow_sd, + state, + visibility, + ray, + &isect, + blocked, + is_transparent_isect, + shadow); + } +# endif /* __KERNEL_GPU__ */ + return shadow_blocked_transparent_all(kg, + sd, + shadow_sd, + state, + visibility, + ray, + max_hits, + shadow); +# else /* __SHADOW_RECORD_ALL__ */ + /* Fallback to a slowest version which works on all devices. 
*/ + return shadow_blocked_transparent_stepped(kg, + sd, + shadow_sd, + state, + visibility, + ray, + &isect, + shadow); +# endif /* __SHADOW_RECORD_ALL__ */ +#endif /* __TRANSPARENT_SHADOWS__ */ } -#endif +#undef SHADOW_STACK_MAX_HITS CCL_NAMESPACE_END - diff --git a/intern/cycles/kernel/kernel_subsurface.h b/intern/cycles/kernel/kernel_subsurface.h index 52c05b85aee..23a09e5e2ca 100644 --- a/intern/cycles/kernel/kernel_subsurface.h +++ b/intern/cycles/kernel/kernel_subsurface.h @@ -28,87 +28,31 @@ CCL_NAMESPACE_BEGIN * - try to reduce one sample model variance */ -#define BSSRDF_MULTI_EVAL - -ccl_device ShaderClosure *subsurface_scatter_pick_closure(KernelGlobals *kg, ShaderData *sd, float *probability) -{ - /* sum sample weights of bssrdf and bsdf */ - float bsdf_sum = 0.0f; - float bssrdf_sum = 0.0f; - - for(int i = 0; i < sd->num_closure; i++) { - ShaderClosure *sc = &sd->closure[i]; - - if(CLOSURE_IS_BSDF(sc->type)) - bsdf_sum += sc->sample_weight; - else if(CLOSURE_IS_BSSRDF(sc->type)) - bssrdf_sum += sc->sample_weight; - } - - /* use bsdf or bssrdf? */ - float r = sd->randb_closure*(bsdf_sum + bssrdf_sum); - - if(r < bsdf_sum) { - /* use bsdf, and adjust randb so we can reuse it for picking a bsdf */ - sd->randb_closure = r/bsdf_sum; - *probability = (bsdf_sum > 0.0f)? (bsdf_sum + bssrdf_sum)/bsdf_sum: 1.0f; - return NULL; - } - - /* use bssrdf */ - r -= bsdf_sum; - - float sum = 0.0f; - - for(int i = 0; i < sd->num_closure; i++) { - ShaderClosure *sc = &sd->closure[i]; - - if(CLOSURE_IS_BSSRDF(sc->type)) { - sum += sc->sample_weight; - - if(r <= sum) { - sd->randb_closure = (r - (sum - sc->sample_weight))/sc->sample_weight; - -#ifdef BSSRDF_MULTI_EVAL - *probability = (bssrdf_sum > 0.0f)? (bsdf_sum + bssrdf_sum)/bssrdf_sum: 1.0f; -#else - *probability = (bssrdf_sum > 0.0f)? 
(bsdf_sum + bssrdf_sum)/sc->sample_weight: 1.0f; -#endif - return sc; - } - } - } - - /* should never happen */ - sd->randb_closure = 0.0f; - *probability = 1.0f; - return NULL; -} - ccl_device_inline float3 subsurface_scatter_eval(ShaderData *sd, - ShaderClosure *sc, + const ShaderClosure *sc, float disk_r, float r, bool all) { -#ifdef BSSRDF_MULTI_EVAL /* this is the veach one-sample model with balance heuristic, some pdf * factors drop out when using balance heuristic weighting */ float3 eval_sum = make_float3(0.0f, 0.0f, 0.0f); float pdf_sum = 0.0f; - float sample_weight_sum = 0.0f; - int num_bssrdf = 0; + float sample_weight_inv = 0.0f; - for(int i = 0; i < sd->num_closure; i++) { - sc = &sd->closure[i]; - - if(CLOSURE_IS_BSSRDF(sc->type)) { - float sample_weight = (all)? 1.0f: sc->sample_weight; - sample_weight_sum += sample_weight; + if(!all) { + float sample_weight_sum = 0.0f; + + for(int i = 0; i < sd->num_closure; i++) { + sc = &sd->closure[i]; + + if(CLOSURE_IS_BSSRDF(sc->type)) { + sample_weight_sum += sc->sample_weight; + } } - } - float sample_weight_inv = 1.0f/sample_weight_sum; + sample_weight_inv = 1.0f/sample_weight_sum; + } for(int i = 0; i < sd->num_closure; i++) { sc = &sd->closure[i]; @@ -125,38 +69,49 @@ ccl_device_inline float3 subsurface_scatter_eval(ShaderData *sd, /* TODO power heuristic is not working correct here */ eval_sum += sc->weight*pdf; //*sample_weight*disk_pdf; pdf_sum += sample_weight*disk_pdf; //*sample_weight*disk_pdf; - - num_bssrdf++; } } return (pdf_sum > 0.0f)? 
eval_sum / pdf_sum : make_float3(0.0f, 0.0f, 0.0f); -#else - float pdf = bssrdf_pdf(pick_sc, r); - float disk_pdf = bssrdf_pdf(pick_sc, disk_r); - - return pick_sc->weight * pdf / disk_pdf; -#endif } /* replace closures with a single diffuse bsdf closure after scatter step */ -ccl_device void subsurface_scatter_setup_diffuse_bsdf(ShaderData *sd, float3 weight, bool hit, float3 N) +ccl_device void subsurface_scatter_setup_diffuse_bsdf(ShaderData *sd, const ShaderClosure *sc, float3 weight, bool hit, float3 N) { sd->flag &= ~SD_CLOSURE_FLAGS; - sd->randb_closure = 0.0f; sd->num_closure = 0; sd->num_closure_extra = 0; if(hit) { - DiffuseBsdf *bsdf = (DiffuseBsdf*)bsdf_alloc(sd, sizeof(DiffuseBsdf), weight); - - if(bsdf) { - bsdf->N = N; - sd->flag |= bsdf_diffuse_setup(bsdf); - - /* replace CLOSURE_BSDF_DIFFUSE_ID with this special ID so render passes - * can recognize it as not being a regular diffuse closure */ - bsdf->type = CLOSURE_BSDF_BSSRDF_ID; + Bssrdf *bssrdf = (Bssrdf *)sc; +#ifdef __PRINCIPLED__ + if(bssrdf->type == CLOSURE_BSSRDF_PRINCIPLED_ID) { + PrincipledDiffuseBsdf *bsdf = (PrincipledDiffuseBsdf*)bsdf_alloc(sd, sizeof(PrincipledDiffuseBsdf), weight); + + if(bsdf) { + bsdf->N = N; + bsdf->roughness = bssrdf->roughness; + sd->flag |= bsdf_principled_diffuse_setup(bsdf); + + /* replace CLOSURE_BSDF_PRINCIPLED_DIFFUSE_ID with this special ID so render passes + * can recognize it as not being a regular Disney principled diffuse closure */ + bsdf->type = CLOSURE_BSDF_BSSRDF_PRINCIPLED_ID; + } + } + else if(CLOSURE_IS_BSDF_BSSRDF(bssrdf->type) || + CLOSURE_IS_BSSRDF(bssrdf->type)) +#endif /* __PRINCIPLED__ */ + { + DiffuseBsdf *bsdf = (DiffuseBsdf*)bsdf_alloc(sd, sizeof(DiffuseBsdf), weight); + + if(bsdf) { + bsdf->N = N; + sd->flag |= bsdf_diffuse_setup(bsdf); + + /* replace CLOSURE_BSDF_DIFFUSE_ID with this special ID so render passes + * can recognize it as not being a regular diffuse closure */ + bsdf->type = CLOSURE_BSDF_BSSRDF_ID; + } } } } @@ -185,7 
+140,7 @@ ccl_device float3 subsurface_color_pow(float3 color, float exponent) ccl_device void subsurface_color_bump_blur(KernelGlobals *kg, ShaderData *sd, - PathState *state, + ccl_addr_space PathState *state, int state_flag, float3 *eval, float3 *N) @@ -199,7 +154,7 @@ ccl_device void subsurface_color_bump_blur(KernelGlobals *kg, if(bump || texture_blur > 0.0f) { /* average color and normal at incoming point */ - shader_eval_surface(kg, sd, NULL, state, 0.0f, state_flag, SHADER_CONTEXT_SSS); + shader_eval_surface(kg, sd, state, state_flag); float3 in_color = shader_bssrdf_sum(sd, (bump)? N: NULL, NULL); /* we simply divide out the average color and multiply with the average @@ -222,7 +177,7 @@ ccl_device_inline int subsurface_scatter_multi_intersect( KernelGlobals *kg, SubsurfaceIntersection *ss_isect, ShaderData *sd, - ShaderClosure *sc, + const ShaderClosure *sc, uint *lcg_state, float disk_u, float disk_v, @@ -235,26 +190,20 @@ ccl_device_inline int subsurface_scatter_multi_intersect( disk_N = sd->Ng; make_orthonormals(disk_N, &disk_T, &disk_B); - /* reusing variable for picking the closure gives a bit nicer stratification - * for path tracer, for branched we do all closures so it doesn't help */ - float axisu = (all)? 
disk_u: sd->randb_closure; - - if(axisu < 0.5f) { + if(disk_u < 0.5f) { pick_pdf_N = 0.5f; pick_pdf_T = 0.25f; pick_pdf_B = 0.25f; - if(all) - disk_u *= 2.0f; + disk_u *= 2.0f; } - else if(axisu < 0.75f) { + else if(disk_u < 0.75f) { float3 tmp = disk_N; disk_N = disk_T; disk_T = tmp; pick_pdf_N = 0.25f; pick_pdf_T = 0.5f; pick_pdf_B = 0.25f; - if(all) - disk_u = (disk_u - 0.5f)*4.0f; + disk_u = (disk_u - 0.5f)*4.0f; } else { float3 tmp = disk_N; @@ -263,8 +212,7 @@ ccl_device_inline int subsurface_scatter_multi_intersect( pick_pdf_N = 0.25f; pick_pdf_T = 0.25f; pick_pdf_B = 0.5f; - if(all) - disk_u = (disk_u - 0.75f)*4.0f; + disk_u = (disk_u - 0.75f)*4.0f; } /* sample point on disk */ @@ -277,7 +225,12 @@ ccl_device_inline int subsurface_scatter_multi_intersect( float3 disk_P = (disk_r*cosf(phi)) * disk_T + (disk_r*sinf(phi)) * disk_B; /* create ray */ +#ifdef __SPLIT_KERNEL__ + Ray ray_object = ss_isect->ray; + Ray *ray = &ray_object; +#else Ray *ray = &ss_isect->ray; +#endif ray->P = sd->P + disk_N*disk_height + disk_P; ray->D = -disk_N; ray->t = 2.0f*disk_height; @@ -288,7 +241,7 @@ ccl_device_inline int subsurface_scatter_multi_intersect( /* intersect with the same object. if multiple intersections are found it * will use at most BSSRDF_MAX_HITS hits, a random subset of all hits */ scene_intersect_subsurface(kg, - ray, + *ray, ss_isect, sd->object, lcg_state, @@ -298,20 +251,20 @@ ccl_device_inline int subsurface_scatter_multi_intersect( for(int hit = 0; hit < num_eval_hits; hit++) { /* Quickly retrieve P and Ng without setting up ShaderData. 
*/ float3 hit_P; - if(ccl_fetch(sd, type) & PRIMITIVE_TRIANGLE) { + if(sd->type & PRIMITIVE_TRIANGLE) { hit_P = triangle_refine_subsurface(kg, sd, &ss_isect->hits[hit], ray); } #ifdef __OBJECT_MOTION__ - else if(ccl_fetch(sd, type) & PRIMITIVE_MOTION_TRIANGLE) { + else if(sd->type & PRIMITIVE_MOTION_TRIANGLE) { float3 verts[3]; motion_triangle_vertices( kg, - ccl_fetch(sd, object), + sd->object, kernel_tex_fetch(__prim_index, ss_isect->hits[hit].prim), - ccl_fetch(sd, time), + sd->time, verts); hit_P = motion_triangle_refine_subsurface(kg, sd, @@ -351,6 +304,10 @@ ccl_device_inline int subsurface_scatter_multi_intersect( ss_isect->weight[hit] = eval; } +#ifdef __SPLIT_KERNEL__ + ss_isect->ray = *ray; +#endif + return num_eval_hits; } @@ -359,13 +316,25 @@ ccl_device_noinline void subsurface_scatter_multi_setup( SubsurfaceIntersection* ss_isect, int hit, ShaderData *sd, - PathState *state, + ccl_addr_space PathState *state, int state_flag, - ShaderClosure *sc, + const ShaderClosure *sc, bool all) { +#ifdef __SPLIT_KERNEL__ + Ray ray_object = ss_isect->ray; + Ray *ray = &ray_object; +#else + Ray *ray = &ss_isect->ray; +#endif + + /* Workaround for AMD GPU OpenCL compiler. Most probably cache bypass issue. */ +#if defined(__SPLIT_KERNEL__) && defined(__KERNEL_OPENCL_AMD__) && defined(__KERNEL_GPU__) + kernel_split_params.dummy_sd_flag = sd->flag; +#endif + /* Setup new shading point. */ - shader_setup_from_subsurface(kg, sd, &ss_isect->hits[hit], &ss_isect->ray); + shader_setup_from_subsurface(kg, sd, &ss_isect->hits[hit], ray); /* Optionally blur colors and bump mapping. */ float3 weight = ss_isect->weight[hit]; @@ -373,12 +342,12 @@ ccl_device_noinline void subsurface_scatter_multi_setup( subsurface_color_bump_blur(kg, sd, state, state_flag, &weight, &N); /* Setup diffuse BSDF. 
*/ - subsurface_scatter_setup_diffuse_bsdf(sd, weight, true, N); + subsurface_scatter_setup_diffuse_bsdf(sd, sc, weight, true, N); } /* subsurface scattering step, from a point on the surface to another nearby point on the same object */ -ccl_device void subsurface_scatter_step(KernelGlobals *kg, ShaderData *sd, PathState *state, - int state_flag, ShaderClosure *sc, uint *lcg_state, float disk_u, float disk_v, bool all) +ccl_device void subsurface_scatter_step(KernelGlobals *kg, ShaderData *sd, ccl_addr_space PathState *state, + int state_flag, const ShaderClosure *sc, uint *lcg_state, float disk_u, float disk_v, bool all) { float3 eval = make_float3(0.0f, 0.0f, 0.0f); @@ -389,18 +358,20 @@ ccl_device void subsurface_scatter_step(KernelGlobals *kg, ShaderData *sd, PathS disk_N = sd->Ng; make_orthonormals(disk_N, &disk_T, &disk_B); - if(sd->randb_closure < 0.5f) { + if(disk_u < 0.5f) { pick_pdf_N = 0.5f; pick_pdf_T = 0.25f; pick_pdf_B = 0.25f; + disk_u *= 2.0f; } - else if(sd->randb_closure < 0.75f) { + else if(disk_u < 0.75f) { float3 tmp = disk_N; disk_N = disk_T; disk_T = tmp; pick_pdf_N = 0.25f; pick_pdf_T = 0.5f; pick_pdf_B = 0.25f; + disk_u = (disk_u - 0.5f)*4.0f; } else { float3 tmp = disk_N; @@ -409,6 +380,7 @@ ccl_device void subsurface_scatter_step(KernelGlobals *kg, ShaderData *sd, PathS pick_pdf_N = 0.25f; pick_pdf_T = 0.25f; pick_pdf_B = 0.5f; + disk_u = (disk_u - 0.75f)*4.0f; } /* sample point on disk */ @@ -432,12 +404,16 @@ ccl_device void subsurface_scatter_step(KernelGlobals *kg, ShaderData *sd, PathS /* intersect with the same object. if multiple intersections are * found it will randomly pick one of them */ SubsurfaceIntersection ss_isect; - scene_intersect_subsurface(kg, &ray, &ss_isect, sd->object, lcg_state, 1); + scene_intersect_subsurface(kg, ray, &ss_isect, sd->object, lcg_state, 1); /* evaluate bssrdf */ if(ss_isect.num_hits > 0) { float3 origP = sd->P; + /* Workaround for AMD GPU OpenCL compiler. Most probably cache bypass issue. 
*/ +#if defined(__SPLIT_KERNEL__) && defined(__KERNEL_OPENCL_AMD__) && defined(__KERNEL_GPU__) + kernel_split_params.dummy_sd_flag = sd->flag; +#endif /* setup new shading point */ shader_setup_from_subsurface(kg, sd, &ss_isect.hits[0], &ray); @@ -463,7 +439,7 @@ ccl_device void subsurface_scatter_step(KernelGlobals *kg, ShaderData *sd, PathS subsurface_color_bump_blur(kg, sd, state, state_flag, &eval, &N); /* setup diffuse bsdf */ - subsurface_scatter_setup_diffuse_bsdf(sd, eval, (ss_isect.num_hits > 0), N); + subsurface_scatter_setup_diffuse_bsdf(sd, sc, eval, (ss_isect.num_hits > 0), N); } CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernel_textures.h b/intern/cycles/kernel/kernel_textures.h index 65aeea18336..c8e54954a84 100644 --- a/intern/cycles/kernel/kernel_textures.h +++ b/intern/cycles/kernel/kernel_textures.h @@ -15,7 +15,7 @@ */ #ifndef KERNEL_TEX -# define KERNEL_TEX(type, ttype, name) +# define KERNEL_TEX(type, name) #endif #ifndef KERNEL_IMAGE_TEX @@ -23,177 +23,169 @@ #endif /* bvh */ -KERNEL_TEX(float4, texture_float4, __bvh_nodes) -KERNEL_TEX(float4, texture_float4, __bvh_leaf_nodes) -KERNEL_TEX(float4, texture_float4, __prim_tri_verts) -KERNEL_TEX(uint, texture_uint, __prim_tri_index) -KERNEL_TEX(uint, texture_uint, __prim_type) -KERNEL_TEX(uint, texture_uint, __prim_visibility) -KERNEL_TEX(uint, texture_uint, __prim_index) -KERNEL_TEX(uint, texture_uint, __prim_object) -KERNEL_TEX(uint, texture_uint, __object_node) +KERNEL_TEX(float4, __bvh_nodes) +KERNEL_TEX(float4, __bvh_leaf_nodes) +KERNEL_TEX(float4, __prim_tri_verts) +KERNEL_TEX(uint, __prim_tri_index) +KERNEL_TEX(uint, __prim_type) +KERNEL_TEX(uint, __prim_visibility) +KERNEL_TEX(uint, __prim_index) +KERNEL_TEX(uint, __prim_object) +KERNEL_TEX(uint, __object_node) +KERNEL_TEX(float2, __prim_time) /* objects */ -KERNEL_TEX(float4, texture_float4, __objects) -KERNEL_TEX(float4, texture_float4, __objects_vector) +KERNEL_TEX(float4, __objects) +KERNEL_TEX(float4, __objects_vector) /* 
triangles */ -KERNEL_TEX(uint, texture_uint, __tri_shader) -KERNEL_TEX(float4, texture_float4, __tri_vnormal) -KERNEL_TEX(uint4, texture_uint4, __tri_vindex) -KERNEL_TEX(uint, texture_uint, __tri_patch) -KERNEL_TEX(float2, texture_float2, __tri_patch_uv) +KERNEL_TEX(uint, __tri_shader) +KERNEL_TEX(float4, __tri_vnormal) +KERNEL_TEX(uint4, __tri_vindex) +KERNEL_TEX(uint, __tri_patch) +KERNEL_TEX(float2, __tri_patch_uv) /* curves */ -KERNEL_TEX(float4, texture_float4, __curves) -KERNEL_TEX(float4, texture_float4, __curve_keys) +KERNEL_TEX(float4, __curves) +KERNEL_TEX(float4, __curve_keys) /* patches */ -KERNEL_TEX(uint, texture_uint, __patches) +KERNEL_TEX(uint, __patches) /* attributes */ -KERNEL_TEX(uint4, texture_uint4, __attributes_map) -KERNEL_TEX(float, texture_float, __attributes_float) -KERNEL_TEX(float4, texture_float4, __attributes_float3) -KERNEL_TEX(uchar4, texture_uchar4, __attributes_uchar4) +KERNEL_TEX(uint4, __attributes_map) +KERNEL_TEX(float, __attributes_float) +KERNEL_TEX(float4, __attributes_float3) +KERNEL_TEX(uchar4, __attributes_uchar4) /* lights */ -KERNEL_TEX(float4, texture_float4, __light_distribution) -KERNEL_TEX(float4, texture_float4, __light_data) -KERNEL_TEX(float2, texture_float2, __light_background_marginal_cdf) -KERNEL_TEX(float2, texture_float2, __light_background_conditional_cdf) +KERNEL_TEX(float4, __light_distribution) +KERNEL_TEX(float4, __light_data) +KERNEL_TEX(float2, __light_background_marginal_cdf) +KERNEL_TEX(float2, __light_background_conditional_cdf) /* particles */ -KERNEL_TEX(float4, texture_float4, __particles) +KERNEL_TEX(float4, __particles) /* shaders */ -KERNEL_TEX(uint4, texture_uint4, __svm_nodes) -KERNEL_TEX(uint, texture_uint, __shader_flag) -KERNEL_TEX(uint, texture_uint, __object_flag) +KERNEL_TEX(uint4, __svm_nodes) +KERNEL_TEX(uint, __shader_flag) +KERNEL_TEX(uint, __object_flag) /* lookup tables */ -KERNEL_TEX(float, texture_float, __lookup_table) +KERNEL_TEX(float, __lookup_table) /* sobol */ 
-KERNEL_TEX(uint, texture_uint, __sobol_directions) +KERNEL_TEX(uint, __sobol_directions) /* volume */ -KERNEL_TEX(uint, texture_uint, __vol_shader) +KERNEL_TEX(uint, __vol_shader) -#ifdef __KERNEL_CUDA__ -# if __CUDA_ARCH__ < 300 +#if !defined(__KERNEL_CUDA__) || __CUDA_ARCH__ >= 300 +/* image textures */ +KERNEL_TEX(TextureInfo, __texture_info) +#else /* full-float image */ KERNEL_IMAGE_TEX(float4, texture_image_float4, __tex_image_float4_000) -KERNEL_IMAGE_TEX(float4, texture_image_float4, __tex_image_float4_001) -KERNEL_IMAGE_TEX(float4, texture_image_float4, __tex_image_float4_002) -KERNEL_IMAGE_TEX(float4, texture_image_float4, __tex_image_float4_003) -KERNEL_IMAGE_TEX(float4, texture_image_float4, __tex_image_float4_004) +KERNEL_IMAGE_TEX(float4, texture_image_float4, __tex_image_float4_008) +KERNEL_IMAGE_TEX(float4, texture_image_float4, __tex_image_float4_016) +KERNEL_IMAGE_TEX(float4, texture_image_float4, __tex_image_float4_024) +KERNEL_IMAGE_TEX(float4, texture_image_float4, __tex_image_float4_032) KERNEL_IMAGE_TEX(float4, texture_image3d_float4, __tex_image_float4_3d_000) -KERNEL_IMAGE_TEX(float4, texture_image3d_float4, __tex_image_float4_3d_001) -KERNEL_IMAGE_TEX(float4, texture_image3d_float4, __tex_image_float4_3d_002) -KERNEL_IMAGE_TEX(float4, texture_image3d_float4, __tex_image_float4_3d_003) -KERNEL_IMAGE_TEX(float4, texture_image3d_float4, __tex_image_float4_3d_004) - -/* image */ -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_005) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_006) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_007) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_008) +KERNEL_IMAGE_TEX(float4, texture_image3d_float4, __tex_image_float4_3d_008) +KERNEL_IMAGE_TEX(float4, texture_image3d_float4, __tex_image_float4_3d_016) +KERNEL_IMAGE_TEX(float4, texture_image3d_float4, __tex_image_float4_3d_024) +KERNEL_IMAGE_TEX(float4, texture_image3d_float4, 
__tex_image_float4_3d_032) + +/* image + * These texture names are encoded to their flattened slots as + * ImageManager::type_index_to_flattened_slot() returns them. */ +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_001) KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_009) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_010) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_011) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_012) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_013) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_014) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_015) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_016) KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_017) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_018) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_019) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_020) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_021) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_022) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_023) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_024) KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_025) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_026) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_027) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_028) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_029) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_030) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_031) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_032) KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, 
__tex_image_byte4_033) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_034) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_035) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_036) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_037) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_038) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_039) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_040) KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_041) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_042) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_043) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_044) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_045) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_046) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_047) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_048) KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_049) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_050) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_051) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_052) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_053) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_054) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_055) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_056) KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_057) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_058) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_059) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_060) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, 
__tex_image_byte4_061) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_062) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_063) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_064) KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_065) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_066) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_067) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_068) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_069) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_070) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_071) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_072) KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_073) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_074) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_075) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_076) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_077) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_078) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_079) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_080) KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_081) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_082) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_083) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_084) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_085) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_086) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_087) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_088) KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, 
__tex_image_byte4_089) - -# else -/* bindless textures */ -KERNEL_TEX(uint, texture_uint, __bindless_mapping) -# endif -#endif - -/* packed image (opencl) */ -KERNEL_TEX(uchar4, texture_uchar4, __tex_image_byte4_packed) -KERNEL_TEX(float4, texture_float4, __tex_image_float4_packed) -KERNEL_TEX(uchar, texture_uchar, __tex_image_byte_packed) -KERNEL_TEX(float, texture_float, __tex_image_float_packed) -KERNEL_TEX(uint4, texture_uint4, __tex_image_packed_info) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_097) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_105) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_113) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_121) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_129) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_137) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_145) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_153) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_161) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_169) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_177) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_185) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_193) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_201) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_209) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_217) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_225) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_233) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_241) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_249) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_257) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, 
__tex_image_byte4_265) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_273) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_281) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_289) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_297) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_305) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_313) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_321) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_329) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_337) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_345) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_353) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_361) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_369) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_377) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_385) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_393) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_401) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_409) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_417) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_425) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_433) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_441) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_449) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_457) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_465) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_473) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_481) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, 
__tex_image_byte4_489) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_497) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_505) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_513) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_521) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_529) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_537) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_545) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_553) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_561) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_569) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_577) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_585) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_593) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_601) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_609) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_617) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_625) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_633) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_641) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_649) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_657) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_665) +#endif /* defined(__KERNEL_CUDA__) && __CUDA_ARCH__ < 300 */ #undef KERNEL_TEX #undef KERNEL_IMAGE_TEX diff --git a/intern/cycles/kernel/kernel_types.h b/intern/cycles/kernel/kernel_types.h index 15960dba40d..6c8e1c4e336 100644 --- a/intern/cycles/kernel/kernel_types.h +++ b/intern/cycles/kernel/kernel_types.h @@ -17,9 +17,9 @@ #ifndef __KERNEL_TYPES_H__ #define __KERNEL_TYPES_H__ -#include "kernel_math.h" 
-#include "svm/svm_types.h" -#include "util_static_assert.h" +#include "kernel/kernel_math.h" +#include "kernel/svm/svm_types.h" +#include "util/util_static_assert.h" #ifndef __KERNEL_GPU__ # define __KERNEL_CPU__ @@ -56,6 +56,26 @@ CCL_NAMESPACE_BEGIN #define VOLUME_STACK_SIZE 16 +#define WORK_POOL_SIZE_GPU 64 +#define WORK_POOL_SIZE_CPU 1 +#ifdef __KERNEL_GPU__ +# define WORK_POOL_SIZE WORK_POOL_SIZE_GPU +#else +# define WORK_POOL_SIZE WORK_POOL_SIZE_CPU +#endif + + +#define SHADER_SORT_BLOCK_SIZE 2048 + +#ifdef __KERNEL_OPENCL__ +# define SHADER_SORT_LOCAL_SIZE 64 +#elif defined(__KERNEL_CUDA__) +# define SHADER_SORT_LOCAL_SIZE 32 +#else +# define SHADER_SORT_LOCAL_SIZE 1 +#endif + + /* device capabilities */ #ifdef __KERNEL_CPU__ # ifdef __KERNEL_SSE2__ @@ -70,23 +90,28 @@ CCL_NAMESPACE_BEGIN # ifdef WITH_OPENVDB # define __OPENVDB__ # endif +# define __PRINCIPLED__ # define __SUBSURFACE__ # define __CMJ__ # define __VOLUME__ -# define __VOLUME_DECOUPLED__ # define __VOLUME_SCATTER__ # define __SHADOW_RECORD_ALL__ +# define __VOLUME_DECOUPLED__ # define __VOLUME_RECORD_ALL__ #endif /* __KERNEL_CPU__ */ #ifdef __KERNEL_CUDA__ # define __KERNEL_SHADING__ # define __KERNEL_ADV_SHADING__ -# define __BRANCHED_PATH__ # define __VOLUME__ # define __VOLUME_SCATTER__ # define __SUBSURFACE__ +# define __PRINCIPLED__ +# define __SHADOW_RECORD_ALL__ # define __CMJ__ +# ifndef __SPLIT_KERNEL__ +# define __BRANCHED_PATH__ +# endif #endif /* __KERNEL_CUDA__ */ #ifdef __KERNEL_OPENCL__ @@ -96,36 +121,45 @@ CCL_NAMESPACE_BEGIN # ifdef __KERNEL_OPENCL_NVIDIA__ # define __KERNEL_SHADING__ # define __KERNEL_ADV_SHADING__ -# ifdef __KERNEL_EXPERIMENTAL__ -# define __CMJ__ -# endif +# define __SUBSURFACE__ +# define __PRINCIPLED__ +# define __VOLUME__ +# define __VOLUME_SCATTER__ +# define __SHADOW_RECORD_ALL__ +# define __CMJ__ +# define __BRANCHED_PATH__ # endif /* __KERNEL_OPENCL_NVIDIA__ */ # ifdef __KERNEL_OPENCL_APPLE__ # define __KERNEL_SHADING__ # define 
__KERNEL_ADV_SHADING__ +# define __PRINCIPLED__ +# define __CMJ__ /* TODO(sergey): Currently experimental section is ignored here, * this is because megakernel in device_opencl does not support * custom cflags depending on the scene features. */ -# ifdef __KERNEL_EXPERIMENTAL__ -# define __CMJ__ -# endif -# endif /* __KERNEL_OPENCL_NVIDIA__ */ +# endif /* __KERNEL_OPENCL_APPLE__ */ # ifdef __KERNEL_OPENCL_AMD__ # define __CL_USE_NATIVE__ # define __KERNEL_SHADING__ # define __KERNEL_ADV_SHADING__ +# define __SUBSURFACE__ +# define __PRINCIPLED__ +# define __VOLUME__ +# define __VOLUME_SCATTER__ +# define __SHADOW_RECORD_ALL__ +# define __CMJ__ +# define __BRANCHED_PATH__ # endif /* __KERNEL_OPENCL_AMD__ */ # ifdef __KERNEL_OPENCL_INTEL_CPU__ # define __CL_USE_NATIVE__ # define __KERNEL_SHADING__ # define __KERNEL_ADV_SHADING__ -# ifdef __KERNEL_EXPERIMENTAL__ -# define __CMJ__ -# endif +# define __PRINCIPLED__ +# define __CMJ__ # endif /* __KERNEL_OPENCL_INTEL_CPU__ */ #endif /* __KERNEL_OPENCL__ */ @@ -143,6 +177,9 @@ CCL_NAMESPACE_BEGIN #define __INTERSECTION_REFINE__ #define __CLAMP_SAMPLE__ #define __PATCH_EVAL__ +#define __SHADOW_TRICKS__ + +#define __DENOISING_FEATURES__ #ifdef __KERNEL_SHADING__ # define __SVM__ @@ -195,10 +232,18 @@ CCL_NAMESPACE_BEGIN #ifdef __NO_PATCH_EVAL__ # undef __PATCH_EVAL__ #endif - -/* Random Numbers */ - -typedef uint RNG; +#ifdef __NO_TRANSPARENT__ +# undef __TRANSPARENT_SHADOWS__ +#endif +#ifdef __NO_SHADOW_TRICKS__ +# undef __SHADOW_TRICKS__ +#endif +#ifdef __NO_PRINCIPLED__ +# undef __PRINCIPLED__ +#endif +#ifdef __NO_DENOISING__ +# undef __DENOISING_FEATURES__ +#endif /* Shader Evaluation */ @@ -239,31 +284,21 @@ enum PathTraceDimension { PRNG_FILTER_V = 1, PRNG_LENS_U = 2, PRNG_LENS_V = 3, -#ifdef __CAMERA_MOTION__ PRNG_TIME = 4, PRNG_UNUSED_0 = 5, PRNG_UNUSED_1 = 6, /* for some reason (6, 7) is a bad sobol pattern */ PRNG_UNUSED_2 = 7, /* with a low number of samples (< 64) */ -#endif - PRNG_BASE_NUM = 8, + PRNG_BASE_NUM = 
10, PRNG_BSDF_U = 0, PRNG_BSDF_V = 1, - PRNG_BSDF = 2, - PRNG_LIGHT = 3, - PRNG_LIGHT_U = 4, - PRNG_LIGHT_V = 5, - PRNG_LIGHT_TERMINATE = 6, - PRNG_TERMINATE = 7, - -#ifdef __VOLUME__ - PRNG_PHASE_U = 8, - PRNG_PHASE_V = 9, - PRNG_PHASE = 10, - PRNG_SCATTER_DISTANCE = 11, -#endif - - PRNG_BOUNCE_NUM = 12, + PRNG_LIGHT_U = 2, + PRNG_LIGHT_V = 3, + PRNG_LIGHT_TERMINATE = 4, + PRNG_TERMINATE = 5, + PRNG_PHASE_CHANNEL = 6, + PRNG_SCATTER_DISTANCE = 7, + PRNG_BOUNCE_NUM = 8, }; enum SamplingPattern { @@ -276,29 +311,36 @@ enum SamplingPattern { /* these flags values correspond to raytypes in osl.cpp, so keep them in sync! */ enum PathRayFlag { - PATH_RAY_CAMERA = 1, - PATH_RAY_REFLECT = 2, - PATH_RAY_TRANSMIT = 4, - PATH_RAY_DIFFUSE = 8, - PATH_RAY_GLOSSY = 16, - PATH_RAY_SINGULAR = 32, - PATH_RAY_TRANSPARENT = 64, - - PATH_RAY_SHADOW_OPAQUE = 128, - PATH_RAY_SHADOW_TRANSPARENT = 256, - PATH_RAY_SHADOW = (PATH_RAY_SHADOW_OPAQUE|PATH_RAY_SHADOW_TRANSPARENT), - - PATH_RAY_CURVE = 512, /* visibility flag to define curve segments */ - PATH_RAY_VOLUME_SCATTER = 1024, /* volume scattering */ + PATH_RAY_CAMERA = (1 << 0), + PATH_RAY_REFLECT = (1 << 1), + PATH_RAY_TRANSMIT = (1 << 2), + PATH_RAY_DIFFUSE = (1 << 3), + PATH_RAY_GLOSSY = (1 << 4), + PATH_RAY_SINGULAR = (1 << 5), + PATH_RAY_TRANSPARENT = (1 << 6), + + PATH_RAY_SHADOW_OPAQUE_NON_CATCHER = (1 << 7), + PATH_RAY_SHADOW_OPAQUE_CATCHER = (1 << 8), + PATH_RAY_SHADOW_OPAQUE = (PATH_RAY_SHADOW_OPAQUE_NON_CATCHER|PATH_RAY_SHADOW_OPAQUE_CATCHER), + PATH_RAY_SHADOW_TRANSPARENT_NON_CATCHER = (1 << 9), + PATH_RAY_SHADOW_TRANSPARENT_CATCHER = (1 << 10), + PATH_RAY_SHADOW_TRANSPARENT = (PATH_RAY_SHADOW_TRANSPARENT_NON_CATCHER|PATH_RAY_SHADOW_TRANSPARENT_CATCHER), + PATH_RAY_SHADOW_NON_CATCHER = (PATH_RAY_SHADOW_OPAQUE_NON_CATCHER|PATH_RAY_SHADOW_TRANSPARENT_NON_CATCHER), + PATH_RAY_SHADOW = (PATH_RAY_SHADOW_OPAQUE|PATH_RAY_SHADOW_TRANSPARENT), + + PATH_RAY_CURVE = (1 << 11), /* visibility flag to define curve segments */ + 
PATH_RAY_VOLUME_SCATTER = (1 << 12), /* volume scattering */ /* Special flag to tag unaligned BVH nodes. */ - PATH_RAY_NODE_UNALIGNED = 2048, + PATH_RAY_NODE_UNALIGNED = (1 << 13), - PATH_RAY_ALL_VISIBILITY = (1|2|4|8|16|32|64|128|256|512|1024|2048), + PATH_RAY_ALL_VISIBILITY = ((1 << 14)-1), - PATH_RAY_MIS_SKIP = 4096, - PATH_RAY_DIFFUSE_ANCESTOR = 8192, - PATH_RAY_SINGLE_PASS_DONE = 16384, + PATH_RAY_MIS_SKIP = (1 << 15), + PATH_RAY_DIFFUSE_ANCESTOR = (1 << 16), + PATH_RAY_SINGLE_PASS_DONE = (1 << 17), + PATH_RAY_SHADOW_CATCHER = (1 << 18), + PATH_RAY_STORE_SHADOW_INFO = (1 << 19), }; /* Closure Label */ @@ -345,14 +387,31 @@ typedef enum PassType { PASS_SUBSURFACE_COLOR = (1 << 24), PASS_LIGHT = (1 << 25), /* no real pass, used to force use_light_pass */ #ifdef __KERNEL_DEBUG__ - PASS_BVH_TRAVERSAL_STEPS = (1 << 26), + PASS_BVH_TRAVERSED_NODES = (1 << 26), PASS_BVH_TRAVERSED_INSTANCES = (1 << 27), - PASS_RAY_BOUNCES = (1 << 28), + PASS_BVH_INTERSECTIONS = (1 << 28), + PASS_RAY_BOUNCES = (1 << 29), #endif } PassType; #define PASS_ALL (~0) +typedef enum DenoisingPassOffsets { + DENOISING_PASS_NORMAL = 0, + DENOISING_PASS_NORMAL_VAR = 3, + DENOISING_PASS_ALBEDO = 6, + DENOISING_PASS_ALBEDO_VAR = 9, + DENOISING_PASS_DEPTH = 12, + DENOISING_PASS_DEPTH_VAR = 13, + DENOISING_PASS_SHADOW_A = 14, + DENOISING_PASS_SHADOW_B = 17, + DENOISING_PASS_COLOR = 20, + DENOISING_PASS_COLOR_VAR = 23, + + DENOISING_PASS_SIZE_BASE = 26, + DENOISING_PASS_SIZE_CLEAN = 3, +} DenoisingPassOffsets; + typedef enum BakePassFilter { BAKE_FILTER_NONE = 0, BAKE_FILTER_DIRECT = (1 << 0), @@ -386,18 +445,54 @@ typedef enum BakePassFilterCombos { BAKE_FILTER_SUBSURFACE_INDIRECT = (BAKE_FILTER_INDIRECT | BAKE_FILTER_SUBSURFACE), } BakePassFilterCombos; +typedef enum DenoiseFlag { + DENOISING_CLEAN_DIFFUSE_DIR = (1 << 0), + DENOISING_CLEAN_DIFFUSE_IND = (1 << 1), + DENOISING_CLEAN_GLOSSY_DIR = (1 << 2), + DENOISING_CLEAN_GLOSSY_IND = (1 << 3), + DENOISING_CLEAN_TRANSMISSION_DIR = (1 << 4), + 
DENOISING_CLEAN_TRANSMISSION_IND = (1 << 5), + DENOISING_CLEAN_SUBSURFACE_DIR = (1 << 6), + DENOISING_CLEAN_SUBSURFACE_IND = (1 << 7), + DENOISING_CLEAN_ALL_PASSES = (1 << 8)-1, +} DenoiseFlag; + +#ifdef __KERNEL_DEBUG__ +/* NOTE: This is a runtime-only struct, alignment is not + * really important here. + */ +typedef struct DebugData { + int num_bvh_traversed_nodes; + int num_bvh_traversed_instances; + int num_bvh_intersections; + int num_ray_bounces; +} DebugData; +#endif + +typedef ccl_addr_space struct PathRadianceState { +#ifdef __PASSES__ + float3 diffuse; + float3 glossy; + float3 transmission; + float3 subsurface; + float3 scatter; + + float3 direct; +#endif +} PathRadianceState; + typedef ccl_addr_space struct PathRadiance { #ifdef __PASSES__ int use_light_pass; #endif + float transparent; float3 emission; #ifdef __PASSES__ float3 background; float3 ao; float3 indirect; - float3 direct_throughput; float3 direct_emission; float3 color_diffuse; @@ -418,15 +513,46 @@ typedef ccl_addr_space struct PathRadiance { float3 indirect_subsurface; float3 indirect_scatter; - float3 path_diffuse; - float3 path_glossy; - float3 path_transmission; - float3 path_subsurface; - float3 path_scatter; - float4 shadow; float mist; #endif + + struct PathRadianceState state; + +#ifdef __SHADOW_TRICKS__ + /* Total light reachable across the path, ignoring shadow blocked queries. */ + float3 path_total; + /* Total light reachable across the path with shadow blocked queries + * applied here. + * + * Dividing this figure by path_total will give estimate of shadow pass. + */ + float3 path_total_shaded; + + /* Color of the background on which shadow is alpha-overed. */ + float3 shadow_background_color; + + /* Path radiance sum and throughput at the moment when ray hits shadow + * catcher object. + */ + float shadow_throughput; + + /* Accumulated transparency along the path after shadow catcher bounce. */ + float shadow_transparency; + + /* Indicate if any shadow catcher data is set. 
*/ + int has_shadow_catcher; +#endif + +#ifdef __DENOISING_FEATURES__ + float3 denoising_normal; + float3 denoising_albedo; + float denoising_depth; +#endif /* __DENOISING_FEATURES__ */ + +#ifdef __KERNEL_DEBUG__ + DebugData debug_data; +#endif /* __KERNEL_DEBUG__ */ } PathRadiance; typedef struct BsdfEval { @@ -442,6 +568,9 @@ typedef struct BsdfEval { float3 subsurface; float3 scatter; #endif +#ifdef __SHADOW_TRICKS__ + float3 sum_no_mis; +#endif } BsdfEval; /* Shader Flag */ @@ -535,29 +664,32 @@ typedef struct Ray { /* Intersection */ -typedef ccl_addr_space struct Intersection { +typedef struct Intersection { float t, u, v; int prim; int object; int type; #ifdef __KERNEL_DEBUG__ - int num_traversal_steps; + int num_traversed_nodes; int num_traversed_instances; + int num_intersections; #endif } Intersection; /* Primitives */ typedef enum PrimitiveType { - PRIMITIVE_NONE = 0, - PRIMITIVE_TRIANGLE = 1, - PRIMITIVE_MOTION_TRIANGLE = 2, - PRIMITIVE_CURVE = 4, - PRIMITIVE_MOTION_CURVE = 8, - /* Lamp primitive is not included below on purpose, since it is no real traceable primitive */ - PRIMITIVE_LAMP = 16, - PRIMITIVE_VOLUME = 32, + PRIMITIVE_NONE = 0, + PRIMITIVE_TRIANGLE = (1 << 0), + PRIMITIVE_MOTION_TRIANGLE = (1 << 1), + PRIMITIVE_CURVE = (1 << 2), + PRIMITIVE_MOTION_CURVE = (1 << 3), + /* Lamp primitive is not included below on purpose, + * since it is no real traceable primitive. + */ + PRIMITIVE_LAMP = (1 << 4), + PRIMITIVE_VOLUME = (1 << 5), PRIMITIVE_ALL_TRIANGLE = (PRIMITIVE_TRIANGLE|PRIMITIVE_MOTION_TRIANGLE), PRIMITIVE_ALL_CURVE = (PRIMITIVE_CURVE|PRIMITIVE_MOTION_CURVE), @@ -565,14 +697,14 @@ typedef enum PrimitiveType { PRIMITIVE_ALL_VOLUME = (PRIMITIVE_VOLUME), PRIMITIVE_ALL = (PRIMITIVE_ALL_TRIANGLE|PRIMITIVE_ALL_CURVE|PRIMITIVE_ALL_VOLUME), - /* Total number of different primitives. + /* Total number of different traceable primitives. * NOTE: This is an actual value, not a bitflag. 
*/ PRIMITIVE_NUM_TOTAL = 4, } PrimitiveType; -#define PRIMITIVE_PACK_SEGMENT(type, segment) ((segment << 16) | type) -#define PRIMITIVE_UNPACK_SEGMENT(type) (type >> 16) +#define PRIMITIVE_PACK_SEGMENT(type, segment) ((segment << PRIMITIVE_NUM_TOTAL) | (type)) +#define PRIMITIVE_UNPACK_SEGMENT(type) (type >> PRIMITIVE_NUM_TOTAL) /* Attributes */ @@ -665,175 +797,197 @@ typedef struct AttributeDescriptor { #define SHADER_CLOSURE_BASE \ float3 weight; \ ClosureType type; \ - float sample_weight \ + float sample_weight; \ + float3 N typedef ccl_addr_space struct ccl_align(16) ShaderClosure { SHADER_CLOSURE_BASE; - float data[14]; /* pad to 80 bytes */ + float data[10]; /* pad to 80 bytes */ } ShaderClosure; -/* Shader Context - * - * For OSL we recycle a fixed number of contexts for speed */ - -typedef enum ShaderContext { - SHADER_CONTEXT_MAIN = 0, - SHADER_CONTEXT_INDIRECT = 1, - SHADER_CONTEXT_EMISSION = 2, - SHADER_CONTEXT_SHADOW = 3, - SHADER_CONTEXT_SSS = 4, - SHADER_CONTEXT_VOLUME = 5, - SHADER_CONTEXT_NUM = 6 -} ShaderContext; - /* Shader Data * * Main shader state at a point on the surface or in a volume. All coordinates - * are in world space. */ + * are in world space. + */ enum ShaderDataFlag { - /* runtime flags */ - SD_BACKFACING = (1 << 0), /* backside of surface? */ - SD_EMISSION = (1 << 1), /* have emissive closure? */ - SD_BSDF = (1 << 2), /* have bsdf closure? */ - SD_BSDF_HAS_EVAL = (1 << 3), /* have non-singular bsdf closure? */ - SD_BSSRDF = (1 << 4), /* have bssrdf */ - SD_HOLDOUT = (1 << 5), /* have holdout closure? */ - SD_ABSORPTION = (1 << 6), /* have volume absorption closure? */ - SD_SCATTER = (1 << 7), /* have volume phase closure? */ - SD_AO = (1 << 8), /* have ao closure? */ - SD_TRANSPARENT = (1 << 9), /* have transparent closure? */ + /* Runtime flags. */ + + /* Set when ray hits backside of surface. */ + SD_BACKFACING = (1 << 0), + /* Shader has emissive closure. */ + SD_EMISSION = (1 << 1), + /* Shader has BSDF closure. 
*/ + SD_BSDF = (1 << 2), + /* Shader has non-singular BSDF closure. */ + SD_BSDF_HAS_EVAL = (1 << 3), + /* Shader has BSSRDF closure. */ + SD_BSSRDF = (1 << 4), + /* Shader has holdout closure. */ + SD_HOLDOUT = (1 << 5), + /* Shader has volume absorption closure. */ + SD_ABSORPTION = (1 << 6), + /* Shader has have volume phase (scatter) closure. */ + SD_SCATTER = (1 << 7), + /* Shader has AO closure. */ + SD_AO = (1 << 8), + /* Shader has transparent closure. */ + SD_TRANSPARENT = (1 << 9), + /* BSDF requires LCG for evaluation. */ SD_BSDF_NEEDS_LCG = (1 << 10), - SD_CLOSURE_FLAGS = (SD_EMISSION|SD_BSDF|SD_BSDF_HAS_EVAL|SD_BSSRDF| - SD_HOLDOUT|SD_ABSORPTION|SD_SCATTER|SD_AO| + SD_CLOSURE_FLAGS = (SD_EMISSION | + SD_BSDF | + SD_BSDF_HAS_EVAL | + SD_BSSRDF | + SD_HOLDOUT | + SD_ABSORPTION | + SD_SCATTER | + SD_AO | SD_BSDF_NEEDS_LCG), - /* shader flags */ - SD_USE_MIS = (1 << 12), /* direct light sample */ - SD_HAS_TRANSPARENT_SHADOW = (1 << 13), /* has transparent shadow */ - SD_HAS_VOLUME = (1 << 14), /* has volume shader */ - SD_HAS_ONLY_VOLUME = (1 << 15), /* has only volume shader, no surface */ - SD_HETEROGENEOUS_VOLUME = (1 << 16), /* has heterogeneous volume */ - SD_HAS_BSSRDF_BUMP = (1 << 17), /* bssrdf normal uses bump */ - SD_VOLUME_EQUIANGULAR = (1 << 18), /* use equiangular sampling */ - SD_VOLUME_MIS = (1 << 19), /* use multiple importance sampling */ - SD_VOLUME_CUBIC = (1 << 20), /* use cubic interpolation for voxels */ - SD_HAS_BUMP = (1 << 21), /* has data connected to the displacement input */ - SD_HAS_DISPLACEMENT = (1 << 22), /* has true displacement */ - SD_HAS_CONSTANT_EMISSION = (1 << 23), /* has constant emission (value stored in __shader_flag) */ - - SD_SHADER_FLAGS = (SD_USE_MIS|SD_HAS_TRANSPARENT_SHADOW|SD_HAS_VOLUME| - SD_HAS_ONLY_VOLUME|SD_HETEROGENEOUS_VOLUME| - SD_HAS_BSSRDF_BUMP|SD_VOLUME_EQUIANGULAR|SD_VOLUME_MIS| - SD_VOLUME_CUBIC|SD_HAS_BUMP|SD_HAS_DISPLACEMENT|SD_HAS_CONSTANT_EMISSION), - - /* object flags */ - SD_HOLDOUT_MASK = 
(1 << 24), /* holdout for camera rays */ - SD_OBJECT_MOTION = (1 << 25), /* has object motion blur */ - SD_TRANSFORM_APPLIED = (1 << 26), /* vertices have transform applied */ - SD_NEGATIVE_SCALE_APPLIED = (1 << 27), /* vertices have negative scale applied */ - SD_OBJECT_HAS_VOLUME = (1 << 28), /* object has a volume shader */ - SD_OBJECT_INTERSECTS_VOLUME = (1 << 29), /* object intersects AABB of an object with volume shader */ - SD_OBJECT_HAS_VERTEX_MOTION = (1 << 30), /* has position for motion vertices */ - - SD_OBJECT_FLAGS = (SD_HOLDOUT_MASK|SD_OBJECT_MOTION|SD_TRANSFORM_APPLIED| - SD_NEGATIVE_SCALE_APPLIED|SD_OBJECT_HAS_VOLUME| - SD_OBJECT_INTERSECTS_VOLUME) + /* Shader flags. */ + + /* direct light sample */ + SD_USE_MIS = (1 << 16), + /* Has transparent shadow. */ + SD_HAS_TRANSPARENT_SHADOW = (1 << 17), + /* Has volume shader. */ + SD_HAS_VOLUME = (1 << 18), + /* Has only volume shader, no surface. */ + SD_HAS_ONLY_VOLUME = (1 << 19), + /* Has heterogeneous volume. */ + SD_HETEROGENEOUS_VOLUME = (1 << 20), + /* BSSRDF normal uses bump. */ + SD_HAS_BSSRDF_BUMP = (1 << 21), + /* Use equiangular volume sampling */ + SD_VOLUME_EQUIANGULAR = (1 << 22), + /* Use multiple importance volume sampling. */ + SD_VOLUME_MIS = (1 << 23), + /* Use cubic interpolation for voxels. */ + SD_VOLUME_CUBIC = (1 << 24), + /* Has data connected to the displacement input or uses bump map. */ + SD_HAS_BUMP = (1 << 25), + /* Has true displacement. 
*/ + SD_HAS_DISPLACEMENT = (1 << 26), + /* Has constant emission (value stored in __shader_flag) */ + SD_HAS_CONSTANT_EMISSION = (1 << 27), + + SD_SHADER_FLAGS = (SD_USE_MIS | + SD_HAS_TRANSPARENT_SHADOW | + SD_HAS_VOLUME | + SD_HAS_ONLY_VOLUME | + SD_HETEROGENEOUS_VOLUME| + SD_HAS_BSSRDF_BUMP | + SD_VOLUME_EQUIANGULAR | + SD_VOLUME_MIS | + SD_VOLUME_CUBIC | + SD_HAS_BUMP | + SD_HAS_DISPLACEMENT | + SD_HAS_CONSTANT_EMISSION) }; -#ifdef __SPLIT_KERNEL__ -# define SD_THREAD (get_global_id(1) * get_global_size(0) + get_global_id(0)) -# if !defined(__SPLIT_KERNEL_SOA__) - /* ShaderData is stored as an Array-of-Structures */ -# define ccl_soa_member(type, name) type soa_##name -# define ccl_fetch(s, t) (s[SD_THREAD].soa_##t) -# define ccl_fetch_array(s, t, index) (&s[SD_THREAD].soa_##t[index]) -# else - /* ShaderData is stored as an Structure-of-Arrays */ -# define SD_GLOBAL_SIZE (get_global_size(0) * get_global_size(1)) -# define SD_FIELD_SIZE(t) sizeof(((struct ShaderData*)0)->t) -# define SD_OFFSETOF(t) ((char*)(&((struct ShaderData*)0)->t) - (char*)0) -# define ccl_soa_member(type, name) type soa_##name -# define ccl_fetch(s, t) (((ShaderData*)((ccl_addr_space char*)s + SD_GLOBAL_SIZE * SD_OFFSETOF(soa_##t) + SD_FIELD_SIZE(soa_##t) * SD_THREAD - SD_OFFSETOF(soa_##t)))->soa_##t) -# define ccl_fetch_array(s, t, index) (&ccl_fetch(s, t)[index]) -# endif -#else -# define ccl_soa_member(type, name) type name -# define ccl_fetch(s, t) (s->t) -# define ccl_fetch_array(s, t, index) (&s->t[index]) -#endif + /* Object flags. */ +enum ShaderDataObjectFlag { + /* Holdout for camera rays. */ + SD_OBJECT_HOLDOUT_MASK = (1 << 0), + /* Has object motion blur. */ + SD_OBJECT_MOTION = (1 << 1), + /* Vertices have transform applied. */ + SD_OBJECT_TRANSFORM_APPLIED = (1 << 2), + /* Vertices have negative scale applied. */ + SD_OBJECT_NEGATIVE_SCALE_APPLIED = (1 << 3), + /* Object has a volume shader. 
*/ + SD_OBJECT_HAS_VOLUME = (1 << 4), + /* Object intersects AABB of an object with volume shader. */ + SD_OBJECT_INTERSECTS_VOLUME = (1 << 5), + /* Has position for motion vertices. */ + SD_OBJECT_HAS_VERTEX_MOTION = (1 << 6), + /* object is used to catch shadows */ + SD_OBJECT_SHADOW_CATCHER = (1 << 7), + + SD_OBJECT_FLAGS = (SD_OBJECT_HOLDOUT_MASK | + SD_OBJECT_MOTION | + SD_OBJECT_TRANSFORM_APPLIED | + SD_OBJECT_NEGATIVE_SCALE_APPLIED | + SD_OBJECT_HAS_VOLUME | + SD_OBJECT_INTERSECTS_VOLUME | + SD_OBJECT_SHADOW_CATCHER) +}; typedef ccl_addr_space struct ShaderData { /* position */ - ccl_soa_member(float3, P); + float3 P; /* smooth normal for shading */ - ccl_soa_member(float3, N); + float3 N; /* true geometric normal */ - ccl_soa_member(float3, Ng); + float3 Ng; /* view/incoming direction */ - ccl_soa_member(float3, I); + float3 I; /* shader id */ - ccl_soa_member(int, shader); + int shader; /* booleans describing shader, see ShaderDataFlag */ - ccl_soa_member(int, flag); + int flag; + /* booleans describing object of the shader, see ShaderDataObjectFlag */ + int object_flag; /* primitive id if there is one, ~0 otherwise */ - ccl_soa_member(int, prim); + int prim; /* combined type and curve segment for hair */ - ccl_soa_member(int, type); + int type; /* parametric coordinates * - barycentric weights for triangles */ - ccl_soa_member(float, u); - ccl_soa_member(float, v); + float u; + float v; /* object id if there is one, ~0 otherwise */ - ccl_soa_member(int, object); + int object; /* motion blur sample time */ - ccl_soa_member(float, time); + float time; /* length of the ray being shaded */ - ccl_soa_member(float, ray_length); + float ray_length; #ifdef __RAY_DIFFERENTIALS__ /* differential of P. 
these are orthogonal to Ng, not N */ - ccl_soa_member(differential3, dP); + differential3 dP; /* differential of I */ - ccl_soa_member(differential3, dI); + differential3 dI; /* differential of u, v */ - ccl_soa_member(differential, du); - ccl_soa_member(differential, dv); + differential du; + differential dv; #endif #ifdef __DPDU__ /* differential of P w.r.t. parametric coordinates. note that dPdu is * not readily suitable as a tangent for shading on triangles. */ - ccl_soa_member(float3, dPdu); - ccl_soa_member(float3, dPdv); + float3 dPdu; + float3 dPdv; #endif #ifdef __OBJECT_MOTION__ /* object <-> world space transformations, cached to avoid * re-interpolating them constantly for shading */ - ccl_soa_member(Transform, ob_tfm); - ccl_soa_member(Transform, ob_itfm); + Transform ob_tfm; + Transform ob_itfm; #endif /* Closure data, we store a fixed array of closures */ - ccl_soa_member(struct ShaderClosure, closure[MAX_CLOSURE]); - ccl_soa_member(int, num_closure); - ccl_soa_member(int, num_closure_extra); - ccl_soa_member(float, randb_closure); - ccl_soa_member(float3, svm_closure_weight); + struct ShaderClosure closure[MAX_CLOSURE]; + int num_closure; + int num_closure_extra; + float randb_closure; + float3 svm_closure_weight; /* LCG state for closures that require additional random numbers. 
*/ - ccl_soa_member(uint, lcg_state); + uint lcg_state; /* ray start position, only set for backgrounds */ - ccl_soa_member(float3, ray_P); - ccl_soa_member(differential3, ray_dP); + float3 ray_P; + differential3 ray_dP; #ifdef __OSL__ struct KernelGlobals *osl_globals; @@ -856,9 +1010,11 @@ typedef struct PathState { int flag; /* random number generator state */ - int rng_offset; /* dimension offset */ - int sample; /* path sample number */ - int num_samples; /* total number of times this path will be sampled */ + uint rng_hash; /* per pixel hash */ + int rng_offset; /* dimension offset */ + int sample; /* path sample number */ + int num_samples; /* total number of times this path will be sampled */ + float branch_factor; /* number of branches in indirect paths */ /* bounce counting */ int bounce; @@ -867,6 +1023,10 @@ typedef struct PathState { int transmission_bounce; int transparent_bounce; +#ifdef __DENOISING_FEATURES__ + float denoising_feature_weight; +#endif /* __DENOISING_FEATURES__ */ + /* multiple importance sampling */ float min_ray_pdf; /* smallest bounce pdf over entire path up to now */ float ray_pdf; /* last bounce pdf */ @@ -877,7 +1037,7 @@ typedef struct PathState { /* volume rendering */ #ifdef __VOLUME__ int volume_bounce; - RNG rng_congruential; + uint rng_congruential; VolumeStack volume_stack[VOLUME_STACK_SIZE]; #endif } PathState; @@ -885,29 +1045,25 @@ typedef struct PathState { /* Subsurface */ /* Struct to gather multiple SSS hits. */ -struct SubsurfaceIntersection -{ +typedef struct SubsurfaceIntersection { Ray ray; float3 weight[BSSRDF_MAX_HITS]; int num_hits; struct Intersection hits[BSSRDF_MAX_HITS]; float3 Ng[BSSRDF_MAX_HITS]; -}; +} SubsurfaceIntersection; /* Struct to gather SSS indirect rays and delay tracing them. 
*/ -struct SubsurfaceIndirectRays -{ - bool need_update_volume_stack; - bool tracing; +typedef struct SubsurfaceIndirectRays { PathState state[BSSRDF_MAX_HITS]; - struct PathRadiance direct_L; int num_rays; + struct Ray rays[BSSRDF_MAX_HITS]; float3 throughputs[BSSRDF_MAX_HITS]; - struct PathRadiance L[BSSRDF_MAX_HITS]; -}; + struct PathRadianceState L_state[BSSRDF_MAX_HITS]; +} SubsurfaceIndirectRays; /* Constant Kernel Data * @@ -1040,11 +1196,16 @@ typedef struct KernelFilm { float mist_inv_depth; float mist_falloff; + int pass_denoising_data; + int pass_denoising_clean; + int denoising_flags; + int pad; + #ifdef __KERNEL_DEBUG__ - int pass_bvh_traversal_steps; + int pass_bvh_traversed_nodes; int pass_bvh_traversed_instances; + int pass_bvh_intersections; int pass_ray_bounces; - int pass_pad3; #endif } KernelFilm; static_assert_align(KernelFilm, 16); @@ -1080,7 +1241,6 @@ typedef struct KernelIntegrator { int portal_offset; /* bounces */ - int min_bounce; int max_bounce; int max_diffuse_bounce; @@ -1088,8 +1248,9 @@ typedef struct KernelIntegrator { int max_transmission_bounce; int max_volume_bounce; + int ao_bounces; + /* transparent */ - int transparent_min_bounce; int transparent_max_bounce; int transparent_shadows; @@ -1107,6 +1268,7 @@ typedef struct KernelIntegrator { /* branched path */ int branched; + int volume_decoupled; int diffuse_samples; int glossy_samples; int transmission_samples; @@ -1131,7 +1293,7 @@ typedef struct KernelIntegrator { float light_inv_rr_threshold; - int pad1; + int start_sample; } KernelIntegrator; static_assert_align(KernelIntegrator, 16); @@ -1143,7 +1305,8 @@ typedef struct KernelBVH { int have_curves; int have_instancing; int use_qbvh; - int pad1, pad2; + int use_bvh_steps; + int pad1; } KernelBVH; static_assert_align(KernelBVH, 16); @@ -1185,19 +1348,6 @@ typedef struct KernelData { } KernelData; static_assert_align(KernelData, 16); -#ifdef __KERNEL_DEBUG__ -/* NOTE: This is a runtime-only struct, alignment is not - * 
really important here. - */ -typedef ccl_addr_space struct DebugData { - // Total number of BVH node traversal steps and primitives intersections - // for the camera rays. - int num_bvh_traversal_steps; - int num_bvh_traversed_instances; - int num_ray_bounces; -} DebugData; -#endif - /* Declarations required for split kernel */ /* Macro for queues */ @@ -1210,7 +1360,6 @@ typedef ccl_addr_space struct DebugData { * Queue 3 - Shadow ray cast kernel - AO * Queeu 4 - Shadow ray cast kernel - direct lighting */ -#define NUM_QUEUES 4 /* Queue names */ enum QueueNumber { @@ -1223,45 +1372,75 @@ enum QueueNumber { * 3. Rays to be regenerated * are enqueued here. */ - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS = 1, + QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS, /* All rays for which a shadow ray should be cast to determine radiance * contribution for AO are enqueued here. */ - QUEUE_SHADOW_RAY_CAST_AO_RAYS = 2, + QUEUE_SHADOW_RAY_CAST_AO_RAYS, /* All rays for which a shadow ray should be cast to determine radiance * contributing for direct lighting are enqueued here. */ - QUEUE_SHADOW_RAY_CAST_DL_RAYS = 3, + QUEUE_SHADOW_RAY_CAST_DL_RAYS, + + /* Rays sorted according to shader->id */ + QUEUE_SHADER_SORTED_RAYS, + +#ifdef __BRANCHED_PATH__ + /* All rays moving to next iteration of the indirect loop for light */ + QUEUE_LIGHT_INDIRECT_ITER, + /* Queue of all inactive rays. 
These are candidates for sharing work of indirect loops */ + QUEUE_INACTIVE_RAYS, +# ifdef __VOLUME__ + /* All rays moving to next iteration of the indirect loop for volumes */ + QUEUE_VOLUME_INDIRECT_ITER, +# endif +# ifdef __SUBSURFACE__ + /* All rays moving to next iteration of the indirect loop for subsurface */ + QUEUE_SUBSURFACE_INDIRECT_ITER, +# endif +#endif /* __BRANCHED_PATH__ */ + + NUM_QUEUES }; -/* We use RAY_STATE_MASK to get ray_state (enums 0 to 5) */ -#define RAY_STATE_MASK 0x007 -#define RAY_FLAG_MASK 0x0F8 +/* We use RAY_STATE_MASK to get ray_state */ +#define RAY_STATE_MASK 0x0F +#define RAY_FLAG_MASK 0xF0 enum RayState { + RAY_INVALID = 0, /* Denotes ray is actively involved in path-iteration. */ - RAY_ACTIVE = 0, + RAY_ACTIVE, /* Denotes ray has completed processing all samples and is inactive. */ - RAY_INACTIVE = 1, + RAY_INACTIVE, /* Denoted ray has exited path-iteration and needs to update output buffer. */ - RAY_UPDATE_BUFFER = 2, + RAY_UPDATE_BUFFER, /* Donotes ray has hit background */ - RAY_HIT_BACKGROUND = 3, + RAY_HIT_BACKGROUND, /* Denotes ray has to be regenerated */ - RAY_TO_REGENERATE = 4, + RAY_TO_REGENERATE, /* Denotes ray has been regenerated */ - RAY_REGENERATED = 5, - /* Denotes ray should skip direct lighting */ - RAY_SKIP_DL = 6, - /* Flag's ray has to execute shadow blocked function in AO part */ - RAY_SHADOW_RAY_CAST_AO = 16, - /* Flag's ray has to execute shadow blocked function in direct lighting part. 
*/ - RAY_SHADOW_RAY_CAST_DL = 32, + RAY_REGENERATED, + /* Denotes ray is moving to next iteration of the branched indirect loop */ + RAY_LIGHT_INDIRECT_NEXT_ITER, + RAY_VOLUME_INDIRECT_NEXT_ITER, + RAY_SUBSURFACE_INDIRECT_NEXT_ITER, + + /* Ray flags */ + + /* Flags to denote that the ray is currently evaluating the branched indirect loop */ + RAY_BRANCHED_LIGHT_INDIRECT = (1 << 4), + RAY_BRANCHED_VOLUME_INDIRECT = (1 << 5), + RAY_BRANCHED_SUBSURFACE_INDIRECT = (1 << 6), + RAY_BRANCHED_INDIRECT = (RAY_BRANCHED_LIGHT_INDIRECT | RAY_BRANCHED_VOLUME_INDIRECT | RAY_BRANCHED_SUBSURFACE_INDIRECT), + + /* Ray is evaluating an iteration of an indirect loop for another thread */ + RAY_BRANCHED_INDIRECT_SHARED = (1 << 7), }; #define ASSIGN_RAY_STATE(ray_state, ray_index, state) (ray_state[ray_index] = ((ray_state[ray_index] & RAY_FLAG_MASK) | state)) -#define IS_STATE(ray_state, ray_index, state) ((ray_state[ray_index] & RAY_STATE_MASK) == state) +#define IS_STATE(ray_state, ray_index, state) ((ray_index) != QUEUE_EMPTY_SLOT && ((ray_state)[(ray_index)] & RAY_STATE_MASK) == (state)) #define ADD_RAY_FLAG(ray_state, ray_index, flag) (ray_state[ray_index] = (ray_state[ray_index] | flag)) #define REMOVE_RAY_FLAG(ray_state, ray_index, flag) (ray_state[ray_index] = (ray_state[ray_index] & (~flag))) #define IS_FLAG(ray_state, ray_index, flag) (ray_state[ray_index] & flag) @@ -1276,6 +1455,20 @@ enum RayState { #define PATCH_MAP_NODE_IS_LEAF (1u << 31) #define PATCH_MAP_NODE_INDEX_MASK (~(PATCH_MAP_NODE_IS_SET | PATCH_MAP_NODE_IS_LEAF)) +/* Work Tiles */ + +typedef struct WorkTile { + uint x, y, w, h; + + uint start_sample; + uint num_samples; + + uint offset; + uint stride; + + ccl_global float *buffer; +} WorkTile; + CCL_NAMESPACE_END #endif /* __KERNEL_TYPES_H__ */ diff --git a/intern/cycles/kernel/kernel_volume.h b/intern/cycles/kernel/kernel_volume.h index e973afe79eb..35f58850f56 100644 --- a/intern/cycles/kernel/kernel_volume.h +++ b/intern/cycles/kernel/kernel_volume.h @@ 
-41,12 +41,12 @@ typedef struct VolumeShaderCoefficients { /* evaluate shader to get extinction coefficient at P */ ccl_device_inline bool volume_shader_extinction_sample(KernelGlobals *kg, ShaderData *sd, - PathState *state, + ccl_addr_space PathState *state, float3 P, float3 *extinction) { sd->P = P; - shader_eval_volume(kg, sd, state, state->volume_stack, PATH_RAY_SHADOW, SHADER_CONTEXT_SHADOW); + shader_eval_volume(kg, sd, state, state->volume_stack, PATH_RAY_SHADOW); if(!(sd->flag & (SD_ABSORPTION|SD_SCATTER))) return false; @@ -67,12 +67,12 @@ ccl_device_inline bool volume_shader_extinction_sample(KernelGlobals *kg, /* evaluate shader to get absorption, scattering and emission at P */ ccl_device_inline bool volume_shader_sample(KernelGlobals *kg, ShaderData *sd, - PathState *state, + ccl_addr_space PathState *state, float3 P, VolumeShaderCoefficients *coeff) { sd->P = P; - shader_eval_volume(kg, sd, state, state->volume_stack, state->flag, SHADER_CONTEXT_VOLUME); + shader_eval_volume(kg, sd, state, state->volume_stack, state->flag); if(!(sd->flag & (SD_ABSORPTION|SD_SCATTER|SD_EMISSION))) return false; @@ -115,7 +115,7 @@ ccl_device float kernel_volume_channel_get(float3 value, int channel) return (channel == 0)? value.x: ((channel == 1)? 
value.y: value.z); } -ccl_device bool volume_stack_is_heterogeneous(KernelGlobals *kg, VolumeStack *stack) +ccl_device bool volume_stack_is_heterogeneous(KernelGlobals *kg, ccl_addr_space VolumeStack *stack) { for(int i = 0; stack[i].shader != SHADER_NONE; i++) { int shader_flag = kernel_tex_fetch(__shader_flag, (stack[i].shader & SHADER_MASK)*SHADER_SIZE); @@ -164,7 +164,11 @@ ccl_device int volume_stack_sampling_method(KernelGlobals *kg, VolumeStack *stac /* homogeneous volume: assume shader evaluation at the starts gives * the extinction coefficient for the entire line segment */ -ccl_device void kernel_volume_shadow_homogeneous(KernelGlobals *kg, PathState *state, Ray *ray, ShaderData *sd, float3 *throughput) +ccl_device void kernel_volume_shadow_homogeneous(KernelGlobals *kg, + ccl_addr_space PathState *state, + Ray *ray, + ShaderData *sd, + float3 *throughput) { float3 sigma_t; @@ -206,7 +210,11 @@ ccl_device_inline bool kernel_volume_integrate_shadow_ray( /* heterogeneous volume: integrate stepping through the volume until we * reach the end, get absorbed entirely, or run out of iterations */ -ccl_device void kernel_volume_shadow_heterogeneous(KernelGlobals *kg, PathState *state, Ray *ray, ShaderData *sd, float3 *throughput) +ccl_device void kernel_volume_shadow_heterogeneous(KernelGlobals *kg, + ccl_addr_space PathState *state, + Ray *ray, + ShaderData *sd, + float3 *throughput) { float3 tp = *throughput; const float tp_eps = 1e-6f; /* todo: this is likely not the right value */ @@ -214,7 +222,7 @@ ccl_device void kernel_volume_shadow_heterogeneous(KernelGlobals *kg, PathState /* prepare for stepping */ int max_steps = kernel_data.integrator.volume_max_steps; float step = kernel_data.integrator.volume_step_size; - float random_jitter_offset = lcg_step_float(&state->rng_congruential) * step; + float random_jitter_offset = lcg_step_float_addrspace(&state->rng_congruential) * step; /* compute extinction at the start */ float t = 0.0f; @@ -295,7 +303,11 @@ 
ccl_device void kernel_volume_shadow_heterogeneous(KernelGlobals *kg, PathState /* get the volume attenuation over line segment defined by ray, with the * assumption that there are no surfaces blocking light between the endpoints */ -ccl_device_noinline void kernel_volume_shadow(KernelGlobals *kg, ShaderData *shadow_sd, PathState *state, Ray *ray, float3 *throughput) +ccl_device_noinline void kernel_volume_shadow(KernelGlobals *kg, + ShaderData *shadow_sd, + ccl_addr_space PathState *state, + Ray *ray, + float3 *throughput) { shader_setup_from_volume(kg, shadow_sd, ray); @@ -313,11 +325,18 @@ ccl_device float kernel_volume_equiangular_sample(Ray *ray, float3 light_P, floa float t = ray->t; float delta = dot((light_P - ray->P) , ray->D); - float D = sqrtf(len_squared(light_P - ray->P) - delta * delta); + float D = safe_sqrtf(len_squared(light_P - ray->P) - delta * delta); + if(UNLIKELY(D == 0.0f)) { + *pdf = 0.0f; + return 0.0f; + } float theta_a = -atan2f(delta, D); float theta_b = atan2f(t - delta, D); float t_ = D * tanf((xi * theta_b) + (1 - xi) * theta_a); - + if(UNLIKELY(theta_b == theta_a)) { + *pdf = 0.0f; + return 0.0f; + } *pdf = D / ((theta_b - theta_a) * (D * D + t_ * t_)); return min(t, delta + t_); /* min is only for float precision errors */ @@ -326,13 +345,19 @@ ccl_device float kernel_volume_equiangular_sample(Ray *ray, float3 light_P, floa ccl_device float kernel_volume_equiangular_pdf(Ray *ray, float3 light_P, float sample_t) { float delta = dot((light_P - ray->P) , ray->D); - float D = sqrtf(len_squared(light_P - ray->P) - delta * delta); + float D = safe_sqrtf(len_squared(light_P - ray->P) - delta * delta); + if(UNLIKELY(D == 0.0f)) { + return 0.0f; + } float t = ray->t; float t_ = sample_t - delta; float theta_a = -atan2f(delta, D); float theta_b = atan2f(t - delta, D); + if(UNLIKELY(theta_b == theta_a)) { + return 0.0f; + } float pdf = D / ((theta_b - theta_a) * (D * D + t_ * t_)); @@ -396,9 +421,14 @@ ccl_device float3 
kernel_volume_emission_integrate(VolumeShaderCoefficients *coe /* homogeneous volume: assume shader evaluation at the start gives * the volume shading coefficient for the entire line segment */ -ccl_device VolumeIntegrateResult kernel_volume_integrate_homogeneous(KernelGlobals *kg, - PathState *state, Ray *ray, ShaderData *sd, PathRadiance *L, float3 *throughput, - RNG *rng, bool probalistic_scatter) +ccl_device VolumeIntegrateResult kernel_volume_integrate_homogeneous( + KernelGlobals *kg, + ccl_addr_space PathState *state, + Ray *ray, + ShaderData *sd, + PathRadiance *L, + ccl_addr_space float3 *throughput, + bool probalistic_scatter) { VolumeShaderCoefficients coeff; @@ -417,13 +447,12 @@ ccl_device VolumeIntegrateResult kernel_volume_integrate_homogeneous(KernelGloba /* pick random color channel, we use the Veach one-sample * model with balance heuristic for the channels */ - float rphase = path_state_rng_1D_for_decision(kg, rng, state, PRNG_PHASE); + float rphase = path_state_rng_1D(kg, state, PRNG_PHASE_CHANNEL); int channel = (int)(rphase*3.0f); - sd->randb_closure = rphase*3.0f - channel; /* decide if we will hit or miss */ bool scatter = true; - float xi = path_state_rng_1D_for_decision(kg, rng, state, PRNG_SCATTER_DISTANCE); + float xi = path_state_rng_1D(kg, state, PRNG_SCATTER_DISTANCE); if(probalistic_scatter) { float sample_sigma_t = kernel_volume_channel_get(sigma_t, channel); @@ -476,7 +505,7 @@ ccl_device VolumeIntegrateResult kernel_volume_integrate_homogeneous(KernelGloba float3 sigma_t = coeff.sigma_a + coeff.sigma_s; float3 transmittance = volume_color_transmittance(sigma_t, ray->t); float3 emission = kernel_volume_emission_integrate(&coeff, closure_flag, transmittance, ray->t); - path_radiance_accum_emission(L, *throughput, emission, state->bounce); + path_radiance_accum_emission(L, state, *throughput, emission); } /* modify throughput */ @@ -496,17 +525,27 @@ ccl_device VolumeIntegrateResult kernel_volume_integrate_homogeneous(KernelGloba } 
ccl_device_inline VolumeIntegrateResult kernel_volume_integrate_ray( - KernelGlobals *kg, PathState *state, Ray *ray, ShaderData *sd, - PathRadiance *L, float3 *throughput, float t, float new_t, - float random_jitter_offset, bool has_scatter, float3 *accum_transmittance, - int channel, const float tp_eps, float *xi) + KernelGlobals *kg, + PathState *state, + Ray *ray, + ShaderData *sd, + PathRadiance *L, + float3 *throughput, + float t, + float new_t, + float random_jitter_offset, + bool has_scatter, + float3 *accum_transmittance, + int channel, + const float tp_eps, + float *xi) { float dt = new_t - t; float3 tp = *throughput; /* use random position inside this segment to sample shader */ if(new_t == ray->t) - random_jitter_offset = lcg_step_float(&state->rng_congruential) * dt; + random_jitter_offset = lcg_step_float_addrspace(&state->rng_congruential) * dt; float3 new_P = ray->P + ray->D * (t + random_jitter_offset); VolumeShaderCoefficients coeff; @@ -568,7 +607,7 @@ ccl_device_inline VolumeIntegrateResult kernel_volume_integrate_ray( /* integrate emission attenuated by absorption */ if(L && (closure_flag & SD_EMISSION)) { float3 emission = kernel_volume_emission_integrate(&coeff, closure_flag, transmittance, dt); - path_radiance_accum_emission(L, tp, emission, state->bounce); + path_radiance_accum_emission(L, state, tp, emission); } /* modify throughput */ @@ -606,7 +645,7 @@ ccl_device_inline VolumeIntegrateResult kernel_volume_integrate_ray( * iterations. this does probabilistically scatter or get transmitted through * for path tracing where we don't want to branch. 
*/ ccl_device VolumeIntegrateResult kernel_volume_integrate_heterogeneous_distance(KernelGlobals *kg, - PathState *state, Ray *ray, ShaderData *sd, PathRadiance *L, float3 *throughput, RNG *rng) + PathState *state, Ray *ray, ShaderData *sd, PathRadiance *L, float3 *throughput) { VolumeIntegrateResult result = VOLUME_PATH_MISSED; const float tp_eps = 1e-6f; /* todo: this is likely not the right value */ @@ -614,7 +653,7 @@ ccl_device VolumeIntegrateResult kernel_volume_integrate_heterogeneous_distance( /* prepare for stepping */ int max_steps = kernel_data.integrator.volume_max_steps; float step_size = kernel_data.integrator.volume_step_size; - float random_jitter_offset = lcg_step_float(&state->rng_congruential) * step_size; + float random_jitter_offset = lcg_step_float_addrspace(&state->rng_congruential) * step_size; /* compute coefficients at the start */ float t = 0.0f; @@ -622,8 +661,8 @@ ccl_device VolumeIntegrateResult kernel_volume_integrate_heterogeneous_distance( /* pick random color channel, we use the Veach one-sample * model with balance heuristic for the channels */ - float xi = path_state_rng_1D_for_decision(kg, rng, state, PRNG_SCATTER_DISTANCE); - float rphase = path_state_rng_1D_for_decision(kg, rng, state, PRNG_PHASE); + float xi = path_state_rng_1D(kg, state, PRNG_SCATTER_DISTANCE); + float rphase = path_state_rng_1D(kg, state, PRNG_PHASE_CHANNEL); int channel = (int)(rphase*3.0f); sd->randb_closure = rphase*3.0f - channel; bool has_scatter = false; @@ -702,22 +741,24 @@ ccl_device VolumeIntegrateResult kernel_volume_integrate_heterogeneous_distance( * ray, with the assumption that there are no surfaces blocking light * between the endpoints. distance sampling is used to decide if we will * scatter or not. 
*/ -ccl_device_noinline VolumeIntegrateResult kernel_volume_integrate(KernelGlobals *kg, - PathState *state, ShaderData *sd, Ray *ray, PathRadiance *L, float3 *throughput, RNG *rng, bool heterogeneous) +ccl_device_noinline VolumeIntegrateResult kernel_volume_integrate( + KernelGlobals *kg, + ccl_addr_space PathState *state, + ShaderData *sd, + Ray *ray, + PathRadiance *L, + ccl_addr_space float3 *throughput, + bool heterogeneous) { - /* workaround to fix correlation bug in T38710, can find better solution - * in random number generator later, for now this is done here to not impact - * performance of rendering without volumes */ - RNG tmp_rng = cmj_hash(*rng, state->rng_offset); - shader_setup_from_volume(kg, sd, ray); if(heterogeneous) - return kernel_volume_integrate_heterogeneous_distance(kg, state, ray, sd, L, throughput, &tmp_rng); + return kernel_volume_integrate_heterogeneous_distance(kg, state, ray, sd, L, throughput); else - return kernel_volume_integrate_homogeneous(kg, state, ray, sd, L, throughput, &tmp_rng, true); + return kernel_volume_integrate_homogeneous(kg, state, ray, sd, L, throughput, true); } +#ifndef __SPLIT_KERNEL__ /* Decoupled Volume Sampling * * VolumeSegment is list of coefficients and transmittance stored at all steps @@ -756,6 +797,7 @@ typedef struct VolumeSegment { * but the entire segment is needed to do always scattering, rather than probabilistically * hitting or missing the volume. if we don't know the transmittance at the end of the * volume we can't generate stratified distance samples up to that transmittance */ +#ifdef __VOLUME_DECOUPLED__ ccl_device void kernel_volume_decoupled_record(KernelGlobals *kg, PathState *state, Ray *ray, ShaderData *sd, VolumeSegment *segment, bool heterogeneous) { @@ -1026,6 +1068,7 @@ ccl_device void kernel_volume_decoupled_free(KernelGlobals *kg, VolumeSegment *s #endif } } +#endif /* __VOLUME_DECOUPLED__ */ /* scattering for homogeneous and heterogeneous volumes, using decoupled ray * marching. 
@@ -1041,7 +1084,6 @@ ccl_device VolumeIntegrateResult kernel_volume_decoupled_scatter( /* pick random color channel, we use the Veach one-sample * model with balance heuristic for the channels */ int channel = (int)(rphase*3.0f); - sd->randb_closure = rphase*3.0f - channel; float xi = rscatter; /* probabilistic scattering decision based on transmittance */ @@ -1195,6 +1237,9 @@ ccl_device VolumeIntegrateResult kernel_volume_decoupled_scatter( mis_weight = 2.0f*power_heuristic(pdf, distance_pdf); } } + if(sample_t < 0.0f || pdf == 0.0f) { + return VOLUME_PATH_MISSED; + } /* compute transmittance up to this step */ if(step != segment->steps) @@ -1216,6 +1261,7 @@ ccl_device VolumeIntegrateResult kernel_volume_decoupled_scatter( return VOLUME_PATH_SCATTERED; } +#endif /* __SPLIT_KERNEL */ /* decide if we need to use decoupled or not */ ccl_device bool kernel_volume_use_decoupled(KernelGlobals *kg, bool heterogeneous, bool direct, int sampling_method) @@ -1223,6 +1269,9 @@ ccl_device bool kernel_volume_use_decoupled(KernelGlobals *kg, bool heterogeneou /* decoupled ray marching for heterogeneous volumes not supported on the GPU, * which also means equiangular and multiple importance sampling is not * support for that case */ + if(!kernel_data.integrator.volume_decoupled) + return false; + #ifdef __KERNEL_GPU__ if(heterogeneous) return false; @@ -1247,9 +1296,9 @@ ccl_device bool kernel_volume_use_decoupled(KernelGlobals *kg, bool heterogeneou ccl_device void kernel_volume_stack_init(KernelGlobals *kg, ShaderData *stack_sd, - const PathState *state, - const Ray *ray, - VolumeStack *stack) + ccl_addr_space const PathState *state, + ccl_addr_space const Ray *ray, + ccl_addr_space VolumeStack *stack) { /* NULL ray happens in the baker, does it need proper initialization of * camera in volume? 
@@ -1393,7 +1442,7 @@ ccl_device void kernel_volume_stack_init(KernelGlobals *kg, } } -ccl_device void kernel_volume_stack_enter_exit(KernelGlobals *kg, ShaderData *sd, VolumeStack *stack) +ccl_device void kernel_volume_stack_enter_exit(KernelGlobals *kg, ShaderData *sd, ccl_addr_space VolumeStack *stack) { /* todo: we should have some way for objects to indicate if they want the * world shader to work inside them. excluding it by default is problematic @@ -1442,7 +1491,7 @@ ccl_device void kernel_volume_stack_enter_exit(KernelGlobals *kg, ShaderData *sd ccl_device void kernel_volume_stack_update_for_subsurface(KernelGlobals *kg, ShaderData *stack_sd, Ray *ray, - VolumeStack *stack) + ccl_addr_space VolumeStack *stack) { kernel_assert(kernel_data.integrator.use_volumes); @@ -1489,4 +1538,30 @@ ccl_device void kernel_volume_stack_update_for_subsurface(KernelGlobals *kg, } #endif +/* Clean stack after the last bounce. + * + * It is expected that all volumes are closed manifolds, so at the time when ray + * hits nothing (for example, it is a last bounce which goes to environment) the + * only expected volume in the stack is the world's one. All the rest volume + * entries should have been exited already. + * + * This isn't always true because of ray intersection precision issues, which + * could lead us to an infinite non-world volume in the stack, causing render + * artifacts. + * + * Use this function after the last bounce to get rid of all volumes apart from + * the world's one after the last bounce to avoid render artifacts. + */ +ccl_device_inline void kernel_volume_clean_stack(KernelGlobals *kg, + ccl_addr_space VolumeStack *volume_stack) +{ + if(kernel_data.background.volume_shader != SHADER_NONE) { + /* Keep the world's volume in stack. 
*/ + volume_stack[1].shader = SHADER_NONE; + } + else { + volume_stack[0].shader = SHADER_NONE; + } +} + CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernel_work_stealing.h b/intern/cycles/kernel/kernel_work_stealing.h index 7d559b1aa31..0c2d9379b63 100644 --- a/intern/cycles/kernel/kernel_work_stealing.h +++ b/intern/cycles/kernel/kernel_work_stealing.h @@ -17,177 +17,66 @@ #ifndef __KERNEL_WORK_STEALING_H__ #define __KERNEL_WORK_STEALING_H__ +CCL_NAMESPACE_BEGIN + /* * Utility functions for work stealing */ -#ifdef __WORK_STEALING__ - #ifdef __KERNEL_OPENCL__ # pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable #endif -uint get_group_id_with_ray_index(uint ray_index, - uint tile_dim_x, - uint tile_dim_y, - uint parallel_samples, - int dim) +#ifdef __SPLIT_KERNEL__ +/* Returns true if there is work */ +ccl_device bool get_next_work(KernelGlobals *kg, + ccl_global uint *work_pools, + uint total_work_size, + uint ray_index, + ccl_private uint *global_work_index) { - if(dim == 0) { - uint x_span = ray_index % (tile_dim_x * parallel_samples); - return x_span / get_local_size(0); + /* With a small amount of work there may be more threads than work due to + * rounding up of global size, stop such threads immediately. */ + if(ray_index >= total_work_size) { + return false; } - else /*if(dim == 1)*/ { - kernel_assert(dim == 1); - uint y_span = ray_index / (tile_dim_x * parallel_samples); - return y_span / get_local_size(1); - } -} - -uint get_total_work(uint tile_dim_x, - uint tile_dim_y, - uint grp_idx, - uint grp_idy, - uint num_samples) -{ - uint threads_within_tile_border_x = - (grp_idx == (get_num_groups(0) - 1)) ? tile_dim_x % get_local_size(0) - : get_local_size(0); - uint threads_within_tile_border_y = - (grp_idy == (get_num_groups(1) - 1)) ? tile_dim_y % get_local_size(1) - : get_local_size(1); - - threads_within_tile_border_x = - (threads_within_tile_border_x == 0) ? 
get_local_size(0) - : threads_within_tile_border_x; - threads_within_tile_border_y = - (threads_within_tile_border_y == 0) ? get_local_size(1) - : threads_within_tile_border_y; - - return threads_within_tile_border_x * - threads_within_tile_border_y * - num_samples; -} -/* Returns 0 in case there is no next work available */ -/* Returns 1 in case work assigned is valid */ -int get_next_work(ccl_global uint *work_pool, - ccl_private uint *my_work, - uint tile_dim_x, - uint tile_dim_y, - uint num_samples, - uint parallel_samples, - uint ray_index) -{ - uint grp_idx = get_group_id_with_ray_index(ray_index, - tile_dim_x, - tile_dim_y, - parallel_samples, - 0); - uint grp_idy = get_group_id_with_ray_index(ray_index, - tile_dim_x, - tile_dim_y, - parallel_samples, - 1); - uint total_work = get_total_work(tile_dim_x, - tile_dim_y, - grp_idx, - grp_idy, - num_samples); - uint group_index = grp_idy * get_num_groups(0) + grp_idx; - *my_work = atomic_inc(&work_pool[group_index]); - return (*my_work < total_work) ? 1 : 0; -} + /* Increase atomic work index counter in pool. */ + uint pool = ray_index / WORK_POOL_SIZE; + uint work_index = atomic_fetch_and_inc_uint32(&work_pools[pool]); -/* This function assumes that the passed my_work is valid. */ -/* Decode sample number w.r.t. assigned my_work. */ -uint get_my_sample(uint my_work, - uint tile_dim_x, - uint tile_dim_y, - uint parallel_samples, - uint ray_index) -{ - uint grp_idx = get_group_id_with_ray_index(ray_index, - tile_dim_x, - tile_dim_y, - parallel_samples, - 0); - uint grp_idy = get_group_id_with_ray_index(ray_index, - tile_dim_x, - tile_dim_y, - parallel_samples, - 1); - uint threads_within_tile_border_x = - (grp_idx == (get_num_groups(0) - 1)) ? tile_dim_x % get_local_size(0) - : get_local_size(0); - uint threads_within_tile_border_y = - (grp_idy == (get_num_groups(1) - 1)) ? tile_dim_y % get_local_size(1) - : get_local_size(1); + /* Map per-pool work index to a global work index. 
*/ + uint global_size = ccl_global_size(0) * ccl_global_size(1); + kernel_assert(global_size % WORK_POOL_SIZE == 0); + kernel_assert(ray_index < global_size); - threads_within_tile_border_x = - (threads_within_tile_border_x == 0) ? get_local_size(0) - : threads_within_tile_border_x; - threads_within_tile_border_y = - (threads_within_tile_border_y == 0) ? get_local_size(1) - : threads_within_tile_border_y; + *global_work_index = (work_index / WORK_POOL_SIZE) * global_size + + (pool * WORK_POOL_SIZE) + + (work_index % WORK_POOL_SIZE); - return my_work / - (threads_within_tile_border_x * threads_within_tile_border_y); + /* Test if all work for this pool is done. */ + return (*global_work_index < total_work_size); } +#endif -/* Decode pixel and tile position w.r.t. assigned my_work. */ -void get_pixel_tile_position(ccl_private uint *pixel_x, - ccl_private uint *pixel_y, - ccl_private uint *tile_x, - ccl_private uint *tile_y, - uint my_work, - uint tile_dim_x, - uint tile_dim_y, - uint tile_offset_x, - uint tile_offset_y, - uint parallel_samples, - uint ray_index) +/* Map global work index to tile, pixel X/Y and sample. */ +ccl_device_inline void get_work_pixel(ccl_global const WorkTile *tile, + uint global_work_index, + ccl_private uint *x, + ccl_private uint *y, + ccl_private uint *sample) { - uint grp_idx = get_group_id_with_ray_index(ray_index, - tile_dim_x, - tile_dim_y, - parallel_samples, - 0); - uint grp_idy = get_group_id_with_ray_index(ray_index, - tile_dim_x, - tile_dim_y, - parallel_samples, - 1); - uint threads_within_tile_border_x = - (grp_idx == (get_num_groups(0) - 1)) ? tile_dim_x % get_local_size(0) - : get_local_size(0); - uint threads_within_tile_border_y = - (grp_idy == (get_num_groups(1) - 1)) ? tile_dim_y % get_local_size(1) - : get_local_size(1); - - threads_within_tile_border_x = - (threads_within_tile_border_x == 0) ? get_local_size(0) - : threads_within_tile_border_x; - threads_within_tile_border_y = - (threads_within_tile_border_y == 0) ? 
get_local_size(1) - : threads_within_tile_border_y; - - uint total_associated_pixels = - threads_within_tile_border_x * threads_within_tile_border_y; - uint work_group_pixel_index = my_work % total_associated_pixels; - uint work_group_pixel_x = - work_group_pixel_index % threads_within_tile_border_x; - uint work_group_pixel_y = - work_group_pixel_index / threads_within_tile_border_x; - - *pixel_x = - tile_offset_x + (grp_idx * get_local_size(0)) + work_group_pixel_x; - *pixel_y = - tile_offset_y + (grp_idy * get_local_size(1)) + work_group_pixel_y; - *tile_x = *pixel_x - tile_offset_x; - *tile_y = *pixel_y - tile_offset_y; + uint tile_pixels = tile->w * tile->h; + uint sample_offset = global_work_index / tile_pixels; + uint pixel_offset = global_work_index - sample_offset * tile_pixels; + uint y_offset = pixel_offset / tile->w; + uint x_offset = pixel_offset - y_offset * tile->w; + + *x = tile->x + x_offset; + *y = tile->y + y_offset; + *sample = tile->start_sample + sample_offset; } -#endif /* __WORK_STEALING__ */ +CCL_NAMESPACE_END #endif /* __KERNEL_WORK_STEALING_H__ */ diff --git a/intern/cycles/kernel/kernels/cpu/filter.cpp b/intern/cycles/kernel/kernels/cpu/filter.cpp new file mode 100644 index 00000000000..2ff1a392dc3 --- /dev/null +++ b/intern/cycles/kernel/kernels/cpu/filter.cpp @@ -0,0 +1,61 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* CPU kernel entry points */ + +/* On x86-64, we can assume SSE2, so avoid the extra kernel and compile this + * one with SSE2 intrinsics. + */ +#if defined(__x86_64__) || defined(_M_X64) +# define __KERNEL_SSE2__ +#endif + +/* When building kernel for native machine detect kernel features from the flags + * set by compiler. + */ +#ifdef WITH_KERNEL_NATIVE +# ifdef __SSE2__ +# ifndef __KERNEL_SSE2__ +# define __KERNEL_SSE2__ +# endif +# endif +# ifdef __SSE3__ +# define __KERNEL_SSE3__ +# endif +# ifdef __SSSE3__ +# define __KERNEL_SSSE3__ +# endif +# ifdef __SSE4_1__ +# define __KERNEL_SSE41__ +# endif +# ifdef __AVX__ +# define __KERNEL_SSE__ +# define __KERNEL_AVX__ +# endif +# ifdef __AVX2__ +# define __KERNEL_SSE__ +# define __KERNEL_AVX2__ +# endif +#endif + +/* quiet unused define warnings */ +#if defined(__KERNEL_SSE2__) + /* do nothing */ +#endif + +#include "kernel/filter/filter.h" +#define KERNEL_ARCH cpu +#include "kernel/kernels/cpu/filter_cpu_impl.h" diff --git a/intern/cycles/kernel/kernels/cpu/filter_avx.cpp b/intern/cycles/kernel/kernels/cpu/filter_avx.cpp new file mode 100644 index 00000000000..4a9e6047ecf --- /dev/null +++ b/intern/cycles/kernel/kernels/cpu/filter_avx.cpp @@ -0,0 +1,39 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* Optimized CPU kernel entry points. 
This file is compiled with AVX + * optimization flags and nearly all functions inlined, while kernel.cpp + * is compiled without for other CPU's. */ + +#include "util/util_optimization.h" + +#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_AVX +# define KERNEL_STUB +#else +/* SSE optimization disabled for now on 32 bit, see bug #36316 */ +# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) +# define __KERNEL_SSE__ +# define __KERNEL_SSE2__ +# define __KERNEL_SSE3__ +# define __KERNEL_SSSE3__ +# define __KERNEL_SSE41__ +# define __KERNEL_AVX__ +# endif +#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX */ + +#include "kernel/filter/filter.h" +#define KERNEL_ARCH cpu_avx +#include "kernel/kernels/cpu/filter_cpu_impl.h" diff --git a/intern/cycles/kernel/kernels/cpu/filter_avx2.cpp b/intern/cycles/kernel/kernels/cpu/filter_avx2.cpp new file mode 100644 index 00000000000..c22ec576254 --- /dev/null +++ b/intern/cycles/kernel/kernels/cpu/filter_avx2.cpp @@ -0,0 +1,40 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* Optimized CPU kernel entry points. This file is compiled with AVX2 + * optimization flags and nearly all functions inlined, while kernel.cpp + * is compiled without for other CPU's. 
*/ + +#include "util/util_optimization.h" + +#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 +# define KERNEL_STUB +#else +/* SSE optimization disabled for now on 32 bit, see bug #36316 */ +# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) +# define __KERNEL_SSE__ +# define __KERNEL_SSE2__ +# define __KERNEL_SSE3__ +# define __KERNEL_SSSE3__ +# define __KERNEL_SSE41__ +# define __KERNEL_AVX__ +# define __KERNEL_AVX2__ +# endif +#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 */ + +#include "kernel/filter/filter.h" +#define KERNEL_ARCH cpu_avx2 +#include "kernel/kernels/cpu/filter_cpu_impl.h" diff --git a/intern/cycles/kernel/kernels/cpu/filter_cpu.h b/intern/cycles/kernel/kernels/cpu/filter_cpu.h new file mode 100644 index 00000000000..bf13ba62806 --- /dev/null +++ b/intern/cycles/kernel/kernels/cpu/filter_cpu.h @@ -0,0 +1,136 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* Templated common declaration part of all CPU kernels. 
*/ + +void KERNEL_FUNCTION_FULL_NAME(filter_divide_shadow)(int sample, + TilesInfo *tiles, + int x, + int y, + float *unfilteredA, + float *unfilteredB, + float *sampleV, + float *sampleVV, + float *bufferV, + int* prefilter_rect, + int buffer_pass_stride, + int buffer_denoising_offset); + +void KERNEL_FUNCTION_FULL_NAME(filter_get_feature)(int sample, + TilesInfo *tiles, + int m_offset, + int v_offset, + int x, + int y, + float *mean, + float *variance, + int* prefilter_rect, + int buffer_pass_stride, + int buffer_denoising_offset); + +void KERNEL_FUNCTION_FULL_NAME(filter_detect_outliers)(int x, int y, + ccl_global float *image, + ccl_global float *variance, + ccl_global float *depth, + ccl_global float *output, + int *rect, + int pass_stride); + +void KERNEL_FUNCTION_FULL_NAME(filter_combine_halves)(int x, int y, + float *mean, + float *variance, + float *a, + float *b, + int* prefilter_rect, + int r); + +void KERNEL_FUNCTION_FULL_NAME(filter_construct_transform)(float* buffer, + int x, + int y, + int storage_ofs, + float *transform, + int *rank, + int* rect, + int pass_stride, + int radius, + float pca_threshold); + +void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_difference)(int dx, + int dy, + float *weight_image, + float *variance, + float *difference_image, + int* rect, + int w, + int channel_offset, + float a, + float k_2); + +void KERNEL_FUNCTION_FULL_NAME(filter_nlm_blur)(float *difference_image, + float *out_image, + int* rect, + int w, + int f); + +void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_weight)(float *difference_image, + float *out_image, + int* rect, + int w, + int f); + +void KERNEL_FUNCTION_FULL_NAME(filter_nlm_update_output)(int dx, + int dy, + float *difference_image, + float *image, + float *out_image, + float *accum_image, + int* rect, + int w, + int f); + +void KERNEL_FUNCTION_FULL_NAME(filter_nlm_construct_gramian)(int dx, + int dy, + float *difference_image, + float *buffer, + float *transform, + int *rank, + float *XtWX, + float3 
*XtWY, + int *rect, + int *filter_rect, + int w, + int h, + int f, + int pass_stride); + +void KERNEL_FUNCTION_FULL_NAME(filter_nlm_normalize)(float *out_image, + float *accum_image, + int* rect, + int w); + +void KERNEL_FUNCTION_FULL_NAME(filter_finalize)(int x, + int y, + int storage_ofs, + int w, + int h, + float *buffer, + int *rank, + float *XtWX, + float3 *XtWY, + int *buffer_params, + int sample); + +#undef KERNEL_ARCH diff --git a/intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h b/intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h new file mode 100644 index 00000000000..2fbb0ea2bdb --- /dev/null +++ b/intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h @@ -0,0 +1,268 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* Templated common implementation part of all CPU kernels. + * + * The idea is that each particular .cpp file sets the needed optimization flags and + * simply includes this file, without having to copy the actual implementation over. 
+ */ + +#include "kernel/kernel_compat_cpu.h" + +#include "kernel/filter/filter_kernel.h" + +#ifdef KERNEL_STUB +# include "util/util_debug.h" +# define STUB_ASSERT(arch, name) assert(!(#name " kernel stub for architecture " #arch " was called!")) +#endif + +CCL_NAMESPACE_BEGIN + + +/* Denoise filter */ + +void KERNEL_FUNCTION_FULL_NAME(filter_divide_shadow)(int sample, + TilesInfo *tiles, + int x, + int y, + float *unfilteredA, + float *unfilteredB, + float *sampleVariance, + float *sampleVarianceV, + float *bufferVariance, + int* prefilter_rect, + int buffer_pass_stride, + int buffer_denoising_offset) +{ +#ifdef KERNEL_STUB + STUB_ASSERT(KERNEL_ARCH, filter_divide_shadow); +#else + kernel_filter_divide_shadow(sample, tiles, + x, y, + unfilteredA, + unfilteredB, + sampleVariance, + sampleVarianceV, + bufferVariance, + load_int4(prefilter_rect), + buffer_pass_stride, + buffer_denoising_offset); +#endif +} + +void KERNEL_FUNCTION_FULL_NAME(filter_get_feature)(int sample, + TilesInfo *tiles, + int m_offset, + int v_offset, + int x, + int y, + float *mean, float *variance, + int* prefilter_rect, + int buffer_pass_stride, + int buffer_denoising_offset) +{ +#ifdef KERNEL_STUB + STUB_ASSERT(KERNEL_ARCH, filter_get_feature); +#else + kernel_filter_get_feature(sample, tiles, + m_offset, v_offset, + x, y, + mean, variance, + load_int4(prefilter_rect), + buffer_pass_stride, + buffer_denoising_offset); +#endif +} + +void KERNEL_FUNCTION_FULL_NAME(filter_detect_outliers)(int x, int y, + ccl_global float *image, + ccl_global float *variance, + ccl_global float *depth, + ccl_global float *output, + int *rect, + int pass_stride) +{ +#ifdef KERNEL_STUB + STUB_ASSERT(KERNEL_ARCH, filter_detect_outliers); +#else + kernel_filter_detect_outliers(x, y, image, variance, depth, output, load_int4(rect), pass_stride); +#endif +} + +void KERNEL_FUNCTION_FULL_NAME(filter_combine_halves)(int x, int y, + float *mean, + float *variance, + float *a, + float *b, + int* prefilter_rect, + int r) +{ 
+#ifdef KERNEL_STUB + STUB_ASSERT(KERNEL_ARCH, filter_combine_halves); +#else + kernel_filter_combine_halves(x, y, mean, variance, a, b, load_int4(prefilter_rect), r); +#endif +} + +void KERNEL_FUNCTION_FULL_NAME(filter_construct_transform)(float* buffer, + int x, + int y, + int storage_ofs, + float *transform, + int *rank, + int* prefilter_rect, + int pass_stride, + int radius, + float pca_threshold) +{ +#ifdef KERNEL_STUB + STUB_ASSERT(KERNEL_ARCH, filter_construct_transform); +#else + rank += storage_ofs; + transform += storage_ofs*TRANSFORM_SIZE; + kernel_filter_construct_transform(buffer, + x, y, + load_int4(prefilter_rect), + pass_stride, + transform, + rank, + radius, + pca_threshold); +#endif +} + +void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_difference)(int dx, + int dy, + float *weight_image, + float *variance, + float *difference_image, + int *rect, + int w, + int channel_offset, + float a, + float k_2) +{ +#ifdef KERNEL_STUB + STUB_ASSERT(KERNEL_ARCH, filter_nlm_calc_difference); +#else + kernel_filter_nlm_calc_difference(dx, dy, weight_image, variance, difference_image, load_int4(rect), w, channel_offset, a, k_2); +#endif +} + +void KERNEL_FUNCTION_FULL_NAME(filter_nlm_blur)(float *difference_image, + float *out_image, + int *rect, + int w, + int f) +{ +#ifdef KERNEL_STUB + STUB_ASSERT(KERNEL_ARCH, filter_nlm_blur); +#else + kernel_filter_nlm_blur(difference_image, out_image, load_int4(rect), w, f); +#endif +} + +void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_weight)(float *difference_image, + float *out_image, + int *rect, + int w, + int f) +{ +#ifdef KERNEL_STUB + STUB_ASSERT(KERNEL_ARCH, filter_nlm_calc_weight); +#else + kernel_filter_nlm_calc_weight(difference_image, out_image, load_int4(rect), w, f); +#endif +} + +void KERNEL_FUNCTION_FULL_NAME(filter_nlm_update_output)(int dx, + int dy, + float *difference_image, + float *image, + float *out_image, + float *accum_image, + int *rect, + int w, + int f) +{ +#ifdef KERNEL_STUB + 
STUB_ASSERT(KERNEL_ARCH, filter_nlm_update_output); +#else + kernel_filter_nlm_update_output(dx, dy, difference_image, image, out_image, accum_image, load_int4(rect), w, f); +#endif +} + +void KERNEL_FUNCTION_FULL_NAME(filter_nlm_construct_gramian)(int dx, + int dy, + float *difference_image, + float *buffer, + float *transform, + int *rank, + float *XtWX, + float3 *XtWY, + int *rect, + int *filter_rect, + int w, + int h, + int f, + int pass_stride) +{ +#ifdef KERNEL_STUB + STUB_ASSERT(KERNEL_ARCH, filter_nlm_construct_gramian); +#else + kernel_filter_nlm_construct_gramian(dx, dy, difference_image, buffer, transform, rank, XtWX, XtWY, load_int4(rect), load_int4(filter_rect), w, h, f, pass_stride); +#endif +} + +void KERNEL_FUNCTION_FULL_NAME(filter_nlm_normalize)(float *out_image, + float *accum_image, + int *rect, + int w) +{ +#ifdef KERNEL_STUB + STUB_ASSERT(KERNEL_ARCH, filter_nlm_normalize); +#else + kernel_filter_nlm_normalize(out_image, accum_image, load_int4(rect), w); +#endif +} + +void KERNEL_FUNCTION_FULL_NAME(filter_finalize)(int x, + int y, + int storage_ofs, + int w, + int h, + float *buffer, + int *rank, + float *XtWX, + float3 *XtWY, + int *buffer_params, + int sample) +{ +#ifdef KERNEL_STUB + STUB_ASSERT(KERNEL_ARCH, filter_finalize); +#else + XtWX += storage_ofs*XTWX_SIZE; + XtWY += storage_ofs*XTWY_SIZE; + rank += storage_ofs; + kernel_filter_finalize(x, y, w, h, buffer, rank, 1, XtWX, XtWY, load_int4(buffer_params), sample); +#endif +} + +#undef KERNEL_STUB +#undef STUB_ASSERT +#undef KERNEL_ARCH + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernels/cpu/filter_sse2.cpp b/intern/cycles/kernel/kernels/cpu/filter_sse2.cpp new file mode 100644 index 00000000000..f7c9935f1d0 --- /dev/null +++ b/intern/cycles/kernel/kernels/cpu/filter_sse2.cpp @@ -0,0 +1,34 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the 
License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* Optimized CPU kernel entry points. This file is compiled with SSE2 + * optimization flags and nearly all functions inlined, while kernel.cpp + * is compiled without for other CPU's. */ + +#include "util/util_optimization.h" + +#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 +# define KERNEL_STUB +#else +/* SSE optimization disabled for now on 32 bit, see bug #36316 */ +# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) +# define __KERNEL_SSE2__ +# endif +#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 */ + +#include "kernel/filter/filter.h" +#define KERNEL_ARCH cpu_sse2 +#include "kernel/kernels/cpu/filter_cpu_impl.h" diff --git a/intern/cycles/kernel/kernels/cpu/filter_sse3.cpp b/intern/cycles/kernel/kernels/cpu/filter_sse3.cpp new file mode 100644 index 00000000000..070b95a3505 --- /dev/null +++ b/intern/cycles/kernel/kernels/cpu/filter_sse3.cpp @@ -0,0 +1,36 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* Optimized CPU kernel entry points. 
This file is compiled with SSE3/SSSE3 + * optimization flags and nearly all functions inlined, while kernel.cpp + * is compiled without for other CPU's. */ + +#include "util/util_optimization.h" + +#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 +# define KERNEL_STUB +#else +/* SSE optimization disabled for now on 32 bit, see bug #36316 */ +# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) +# define __KERNEL_SSE2__ +# define __KERNEL_SSE3__ +# define __KERNEL_SSSE3__ +# endif +#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 */ + +#include "kernel/filter/filter.h" +#define KERNEL_ARCH cpu_sse3 +#include "kernel/kernels/cpu/filter_cpu_impl.h" diff --git a/intern/cycles/kernel/kernels/cpu/filter_sse41.cpp b/intern/cycles/kernel/kernels/cpu/filter_sse41.cpp new file mode 100644 index 00000000000..254025be4e2 --- /dev/null +++ b/intern/cycles/kernel/kernels/cpu/filter_sse41.cpp @@ -0,0 +1,38 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* Optimized CPU kernel entry points. This file is compiled with SSE4.1 + * optimization flags and nearly all functions inlined, while kernel.cpp + * is compiled without for other CPU's. 
*/ + +#include "util/util_optimization.h" + +#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 +# define KERNEL_STUB +#else +/* SSE optimization disabled for now on 32 bit, see bug #36316 */ +# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) +# define __KERNEL_SSE__ +# define __KERNEL_SSE2__ +# define __KERNEL_SSE3__ +# define __KERNEL_SSSE3__ +# define __KERNEL_SSE41__ +# endif +#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 */ + +#include "kernel/filter/filter.h" +#define KERNEL_ARCH cpu_sse41 +#include "kernel/kernels/cpu/filter_cpu_impl.h" diff --git a/intern/cycles/kernel/kernels/cpu/kernel.cpp b/intern/cycles/kernel/kernels/cpu/kernel.cpp index 72dbbd9a416..7679ab4f111 100644 --- a/intern/cycles/kernel/kernels/cpu/kernel.cpp +++ b/intern/cycles/kernel/kernels/cpu/kernel.cpp @@ -56,9 +56,9 @@ /* do nothing */ #endif -#include "kernel.h" +#include "kernel/kernel.h" #define KERNEL_ARCH cpu -#include "kernel_cpu_impl.h" +#include "kernel/kernels/cpu/kernel_cpu_impl.h" CCL_NAMESPACE_BEGIN @@ -84,112 +84,16 @@ void kernel_tex_copy(KernelGlobals *kg, if(0) { } -#define KERNEL_TEX(type, ttype, tname) \ +#define KERNEL_TEX(type, tname) \ else if(strcmp(name, #tname) == 0) { \ kg->tname.data = (type*)mem; \ kg->tname.width = width; \ } -#define KERNEL_IMAGE_TEX(type, ttype, tname) -#include "kernel_textures.h" - - else if(strstr(name, "__tex_image_float4")) { - texture_image_float4 *tex = NULL; - int id = atoi(name + strlen("__tex_image_float4_")); - int array_index = id; - - if(array_index >= 0 && array_index < TEX_NUM_FLOAT4_CPU) { - tex = &kg->texture_float4_images[array_index]; - } - - if(tex) { - tex->data = (float4*)mem; - tex->dimensions_set(width, height, depth); - tex->interpolation = interpolation; - tex->extension = extension; - } - } - else if(strstr(name, "__tex_image_float")) { - texture_image_float *tex = NULL; - int id = atoi(name + strlen("__tex_image_float_")); - int array_index = id - TEX_START_FLOAT_CPU; - - if(array_index >= 0 && array_index 
< TEX_NUM_FLOAT_CPU) { - tex = &kg->texture_float_images[array_index]; - } - - if(tex) { - tex->data = (float*)mem; - tex->dimensions_set(width, height, depth); - tex->interpolation = interpolation; - tex->extension = extension; - } - } - else if(strstr(name, "__tex_image_byte4")) { - texture_image_uchar4 *tex = NULL; - int id = atoi(name + strlen("__tex_image_byte4_")); - int array_index = id - TEX_START_BYTE4_CPU; - - if(array_index >= 0 && array_index < TEX_NUM_BYTE4_CPU) { - tex = &kg->texture_byte4_images[array_index]; - } - - if(tex) { - tex->data = (uchar4*)mem; - tex->dimensions_set(width, height, depth); - tex->interpolation = interpolation; - tex->extension = extension; - } - } - else if(strstr(name, "__tex_image_byte")) { - texture_image_uchar *tex = NULL; - int id = atoi(name + strlen("__tex_image_byte_")); - int array_index = id - TEX_START_BYTE_CPU; - - if(array_index >= 0 && array_index < TEX_NUM_BYTE_CPU) { - tex = &kg->texture_byte_images[array_index]; - } - - if(tex) { - tex->data = (uchar*)mem; - tex->dimensions_set(width, height, depth); - tex->interpolation = interpolation; - tex->extension = extension; - } - } - else if(strstr(name, "__tex_image_half4")) { - texture_image_half4 *tex = NULL; - int id = atoi(name + strlen("__tex_image_half4_")); - int array_index = id - TEX_START_HALF4_CPU; - - if(array_index >= 0 && array_index < TEX_NUM_HALF4_CPU) { - tex = &kg->texture_half4_images[array_index]; - } - - if(tex) { - tex->data = (half4*)mem; - tex->dimensions_set(width, height, depth); - tex->interpolation = interpolation; - tex->extension = extension; - } - } - else if(strstr(name, "__tex_image_half")) { - texture_image_half *tex = NULL; - int id = atoi(name + strlen("__tex_image_half_")); - int array_index = id - TEX_START_HALF_CPU; - - if(array_index >= 0 && array_index < TEX_NUM_HALF_CPU) { - tex = &kg->texture_half_images[array_index]; - } - - if(tex) { - tex->data = (half*)mem; - tex->dimensions_set(width, height, depth); - 
tex->interpolation = interpolation; - tex->extension = extension; - } - } - else +#define KERNEL_IMAGE_TEX(type, tname) +#include "kernel/kernel_textures.h" + else { assert(0); + } } CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernels/cpu/kernel_avx.cpp b/intern/cycles/kernel/kernels/cpu/kernel_avx.cpp index 1350d9e5c2e..a645fb4d8dd 100644 --- a/intern/cycles/kernel/kernels/cpu/kernel_avx.cpp +++ b/intern/cycles/kernel/kernels/cpu/kernel_avx.cpp @@ -17,21 +17,23 @@ /* Optimized CPU kernel entry points. This file is compiled with AVX * optimization flags and nearly all functions inlined, while kernel.cpp * is compiled without for other CPU's. */ - -/* SSE optimization disabled for now on 32 bit, see bug #36316 */ -#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) -# define __KERNEL_SSE__ -# define __KERNEL_SSE2__ -# define __KERNEL_SSE3__ -# define __KERNEL_SSSE3__ -# define __KERNEL_SSE41__ -# define __KERNEL_AVX__ -#endif -#include "util_optimization.h" +#include "util/util_optimization.h" -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX -# include "kernel.h" -# define KERNEL_ARCH cpu_avx -# include "kernel_cpu_impl.h" +#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_AVX +# define KERNEL_STUB +#else +/* SSE optimization disabled for now on 32 bit, see bug #36316 */ +# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) +# define __KERNEL_SSE__ +# define __KERNEL_SSE2__ +# define __KERNEL_SSE3__ +# define __KERNEL_SSSE3__ +# define __KERNEL_SSE41__ +# define __KERNEL_AVX__ +# endif #endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX */ + +#include "kernel/kernel.h" +#define KERNEL_ARCH cpu_avx +#include "kernel/kernels/cpu/kernel_cpu_impl.h" diff --git a/intern/cycles/kernel/kernels/cpu/kernel_avx2.cpp b/intern/cycles/kernel/kernels/cpu/kernel_avx2.cpp index 1a416e771ee..6bbb87727b9 100644 --- a/intern/cycles/kernel/kernels/cpu/kernel_avx2.cpp +++ b/intern/cycles/kernel/kernels/cpu/kernel_avx2.cpp @@ -18,21 +18,23 @@ * optimization flags and nearly all 
functions inlined, while kernel.cpp * is compiled without for other CPU's. */ -/* SSE optimization disabled for now on 32 bit, see bug #36316 */ -#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) -# define __KERNEL_SSE__ -# define __KERNEL_SSE2__ -# define __KERNEL_SSE3__ -# define __KERNEL_SSSE3__ -# define __KERNEL_SSE41__ -# define __KERNEL_AVX__ -# define __KERNEL_AVX2__ -#endif - -#include "util_optimization.h" +#include "util/util_optimization.h" -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 -# include "kernel.h" -# define KERNEL_ARCH cpu_avx2 -# include "kernel_cpu_impl.h" +#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 +# define KERNEL_STUB +#else +/* SSE optimization disabled for now on 32 bit, see bug #36316 */ +# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) +# define __KERNEL_SSE__ +# define __KERNEL_SSE2__ +# define __KERNEL_SSE3__ +# define __KERNEL_SSSE3__ +# define __KERNEL_SSE41__ +# define __KERNEL_AVX__ +# define __KERNEL_AVX2__ +# endif #endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 */ + +#include "kernel/kernel.h" +#define KERNEL_ARCH cpu_avx2 +#include "kernel/kernels/cpu/kernel_cpu_impl.h" diff --git a/intern/cycles/kernel/kernels/cpu/kernel_cpu.h b/intern/cycles/kernel/kernels/cpu/kernel_cpu.h index 1a07c705f1c..6bdb8546a24 100644 --- a/intern/cycles/kernel/kernels/cpu/kernel_cpu.h +++ b/intern/cycles/kernel/kernels/cpu/kernel_cpu.h @@ -18,7 +18,6 @@ void KERNEL_FUNCTION_FULL_NAME(path_trace)(KernelGlobals *kg, float *buffer, - unsigned int *rng_state, int sample, int x, int y, int offset, @@ -42,11 +41,50 @@ void KERNEL_FUNCTION_FULL_NAME(convert_to_half_float)(KernelGlobals *kg, void KERNEL_FUNCTION_FULL_NAME(shader)(KernelGlobals *kg, uint4 *input, float4 *output, - float *output_luma, int type, int filter, int i, int offset, int sample); +/* Split kernels */ + +void KERNEL_FUNCTION_FULL_NAME(data_init)( + KernelGlobals *kg, + ccl_constant KernelData *data, + ccl_global void *split_data_buffer, + int num_elements, + 
ccl_global char *ray_state, + int start_sample, + int end_sample, + int sx, int sy, int sw, int sh, int offset, int stride, + ccl_global int *Queue_index, + int queuesize, + ccl_global char *use_queues_flag, + ccl_global unsigned int *work_pool_wgs, + unsigned int num_samples, + ccl_global float *buffer); + +#define DECLARE_SPLIT_KERNEL_FUNCTION(name) \ + void KERNEL_FUNCTION_FULL_NAME(name)(KernelGlobals *kg, KernelData *data); + +DECLARE_SPLIT_KERNEL_FUNCTION(path_init) +DECLARE_SPLIT_KERNEL_FUNCTION(scene_intersect) +DECLARE_SPLIT_KERNEL_FUNCTION(lamp_emission) +DECLARE_SPLIT_KERNEL_FUNCTION(do_volume) +DECLARE_SPLIT_KERNEL_FUNCTION(queue_enqueue) +DECLARE_SPLIT_KERNEL_FUNCTION(indirect_background) +DECLARE_SPLIT_KERNEL_FUNCTION(shader_setup) +DECLARE_SPLIT_KERNEL_FUNCTION(shader_sort) +DECLARE_SPLIT_KERNEL_FUNCTION(shader_eval) +DECLARE_SPLIT_KERNEL_FUNCTION(holdout_emission_blurring_pathtermination_ao) +DECLARE_SPLIT_KERNEL_FUNCTION(subsurface_scatter) +DECLARE_SPLIT_KERNEL_FUNCTION(direct_lighting) +DECLARE_SPLIT_KERNEL_FUNCTION(shadow_blocked_ao) +DECLARE_SPLIT_KERNEL_FUNCTION(shadow_blocked_dl) +DECLARE_SPLIT_KERNEL_FUNCTION(enqueue_inactive) +DECLARE_SPLIT_KERNEL_FUNCTION(next_iteration_setup) +DECLARE_SPLIT_KERNEL_FUNCTION(indirect_subsurface) +DECLARE_SPLIT_KERNEL_FUNCTION(buffer_update) + #undef KERNEL_ARCH diff --git a/intern/cycles/kernel/kernels/cpu/kernel_cpu_image.h b/intern/cycles/kernel/kernels/cpu/kernel_cpu_image.h index af68907a5c2..37ba0f692be 100644 --- a/intern/cycles/kernel/kernels/cpu/kernel_cpu_image.h +++ b/intern/cycles/kernel/kernels/cpu/kernel_cpu_image.h @@ -17,62 +17,478 @@ #ifndef __KERNEL_CPU_IMAGE_H__ #define __KERNEL_CPU_IMAGE_H__ -#ifdef __KERNEL_CPU__ - CCL_NAMESPACE_BEGIN -ccl_device float4 kernel_tex_image_interp_impl(KernelGlobals *kg, int tex, float x, float y) -{ - if(tex >= TEX_START_HALF_CPU) - return kg->texture_half_images[tex - TEX_START_HALF_CPU].interp(x, y); - else if(tex >= TEX_START_BYTE_CPU) - return 
kg->texture_byte_images[tex - TEX_START_BYTE_CPU].interp(x, y); - else if(tex >= TEX_START_FLOAT_CPU) - return kg->texture_float_images[tex - TEX_START_FLOAT_CPU].interp(x, y); - else if(tex >= TEX_START_HALF4_CPU) - return kg->texture_half4_images[tex - TEX_START_HALF4_CPU].interp(x, y); - else if(tex >= TEX_START_BYTE4_CPU) - return kg->texture_byte4_images[tex - TEX_START_BYTE4_CPU].interp(x, y); - else - return kg->texture_float4_images[tex].interp(x, y); -} +template<typename T> struct TextureInterpolator { +#define SET_CUBIC_SPLINE_WEIGHTS(u, t) \ + { \ + u[0] = (((-1.0f/6.0f)* t + 0.5f) * t - 0.5f) * t + (1.0f/6.0f); \ + u[1] = (( 0.5f * t - 1.0f) * t ) * t + (2.0f/3.0f); \ + u[2] = (( -0.5f * t + 0.5f) * t + 0.5f) * t + (1.0f/6.0f); \ + u[3] = (1.0f / 6.0f) * t * t * t; \ + } (void)0 + + static ccl_always_inline float4 read(float4 r) + { + return r; + } + + static ccl_always_inline float4 read(uchar4 r) + { + float f = 1.0f/255.0f; + return make_float4(r.x*f, r.y*f, r.z*f, r.w*f); + } + + static ccl_always_inline float4 read(uchar r) + { + float f = r*(1.0f/255.0f); + return make_float4(f, f, f, 1.0f); + } + + static ccl_always_inline float4 read(float r) + { + /* TODO(dingto): Optimize this, so interpolation + * happens on float instead of float4 */ + return make_float4(r, r, r, 1.0f); + } + + static ccl_always_inline float4 read(half4 r) + { + return half4_to_float4(r); + } + + static ccl_always_inline float4 read(half r) + { + float f = half_to_float(r); + return make_float4(f, f, f, 1.0f); + } + + static ccl_always_inline int wrap_periodic(int x, int width) + { + x %= width; + if(x < 0) + x += width; + return x; + } + + static ccl_always_inline int wrap_clamp(int x, int width) + { + return clamp(x, 0, width-1); + } + + static ccl_always_inline float frac(float x, int *ix) + { + int i = float_to_int(x) - ((x < 0.0f)? 
1: 0); + *ix = i; + return x - (float)i; + } + + static ccl_always_inline float4 interp(const TextureInfo& info, float x, float y) + { + if(UNLIKELY(!info.data)) + return make_float4(0.0f, 0.0f, 0.0f, 0.0f); + + const T *data = (const T*)info.data; + int width = info.width; + int height = info.height; + int ix, iy, nix, niy; + + if(info.interpolation == INTERPOLATION_CLOSEST) { + frac(x*(float)width, &ix); + frac(y*(float)height, &iy); + switch(info.extension) { + case EXTENSION_REPEAT: + ix = wrap_periodic(ix, width); + iy = wrap_periodic(iy, height); + break; + case EXTENSION_CLIP: + if(x < 0.0f || y < 0.0f || x > 1.0f || y > 1.0f) { + return make_float4(0.0f, 0.0f, 0.0f, 0.0f); + } + ATTR_FALLTHROUGH; + case EXTENSION_EXTEND: + ix = wrap_clamp(ix, width); + iy = wrap_clamp(iy, height); + break; + default: + kernel_assert(0); + return make_float4(0.0f, 0.0f, 0.0f, 0.0f); + } + return read(data[ix + iy*width]); + } + else if(info.interpolation == INTERPOLATION_LINEAR) { + float tx = frac(x*(float)width - 0.5f, &ix); + float ty = frac(y*(float)height - 0.5f, &iy); + + switch(info.extension) { + case EXTENSION_REPEAT: + ix = wrap_periodic(ix, width); + iy = wrap_periodic(iy, height); + + nix = wrap_periodic(ix+1, width); + niy = wrap_periodic(iy+1, height); + break; + case EXTENSION_CLIP: + if(x < 0.0f || y < 0.0f || x > 1.0f || y > 1.0f) { + return make_float4(0.0f, 0.0f, 0.0f, 0.0f); + } + ATTR_FALLTHROUGH; + case EXTENSION_EXTEND: + nix = wrap_clamp(ix+1, width); + niy = wrap_clamp(iy+1, height); + + ix = wrap_clamp(ix, width); + iy = wrap_clamp(iy, height); + break; + default: + kernel_assert(0); + return make_float4(0.0f, 0.0f, 0.0f, 0.0f); + } + + float4 r = (1.0f - ty)*(1.0f - tx)*read(data[ix + iy*width]); + r += (1.0f - ty)*tx*read(data[nix + iy*width]); + r += ty*(1.0f - tx)*read(data[ix + niy*width]); + r += ty*tx*read(data[nix + niy*width]); + + return r; + } + else { + /* Bicubic b-spline interpolation. 
*/ + float tx = frac(x*(float)width - 0.5f, &ix); + float ty = frac(y*(float)height - 0.5f, &iy); + int pix, piy, nnix, nniy; + switch(info.extension) { + case EXTENSION_REPEAT: + ix = wrap_periodic(ix, width); + iy = wrap_periodic(iy, height); + + pix = wrap_periodic(ix-1, width); + piy = wrap_periodic(iy-1, height); + + nix = wrap_periodic(ix+1, width); + niy = wrap_periodic(iy+1, height); + + nnix = wrap_periodic(ix+2, width); + nniy = wrap_periodic(iy+2, height); + break; + case EXTENSION_CLIP: + if(x < 0.0f || y < 0.0f || x > 1.0f || y > 1.0f) { + return make_float4(0.0f, 0.0f, 0.0f, 0.0f); + } + ATTR_FALLTHROUGH; + case EXTENSION_EXTEND: + pix = wrap_clamp(ix-1, width); + piy = wrap_clamp(iy-1, height); + + nix = wrap_clamp(ix+1, width); + niy = wrap_clamp(iy+1, height); + + nnix = wrap_clamp(ix+2, width); + nniy = wrap_clamp(iy+2, height); + + ix = wrap_clamp(ix, width); + iy = wrap_clamp(iy, height); + break; + default: + kernel_assert(0); + return make_float4(0.0f, 0.0f, 0.0f, 0.0f); + } + + const int xc[4] = {pix, ix, nix, nnix}; + const int yc[4] = {width * piy, + width * iy, + width * niy, + width * nniy}; + float u[4], v[4]; + /* Some helper macro to keep code reasonable size, + * let compiler to inline all the matrix multiplications. + */ +#define DATA(x, y) (read(data[xc[x] + yc[y]])) +#define TERM(col) \ + (v[col] * (u[0] * DATA(0, col) + \ + u[1] * DATA(1, col) + \ + u[2] * DATA(2, col) + \ + u[3] * DATA(3, col))) + + SET_CUBIC_SPLINE_WEIGHTS(u, tx); + SET_CUBIC_SPLINE_WEIGHTS(v, ty); + + /* Actual interpolation. 
*/ + return TERM(0) + TERM(1) + TERM(2) + TERM(3); + +#undef TERM +#undef DATA + } + } + + static ccl_always_inline float4 interp_3d_closest(const TextureInfo& info, float x, float y, float z) + { + int width = info.width; + int height = info.height; + int depth = info.depth; + int ix, iy, iz; -ccl_device float4 kernel_tex_image_interp_3d_impl(KernelGlobals *kg, int tex, float x, float y, float z) + frac(x*(float)width, &ix); + frac(y*(float)height, &iy); + frac(z*(float)depth, &iz); + + switch(info.extension) { + case EXTENSION_REPEAT: + ix = wrap_periodic(ix, width); + iy = wrap_periodic(iy, height); + iz = wrap_periodic(iz, depth); + break; + case EXTENSION_CLIP: + if(x < 0.0f || y < 0.0f || z < 0.0f || + x > 1.0f || y > 1.0f || z > 1.0f) + { + return make_float4(0.0f, 0.0f, 0.0f, 0.0f); + } + ATTR_FALLTHROUGH; + case EXTENSION_EXTEND: + ix = wrap_clamp(ix, width); + iy = wrap_clamp(iy, height); + iz = wrap_clamp(iz, depth); + break; + default: + kernel_assert(0); + return make_float4(0.0f, 0.0f, 0.0f, 0.0f); + } + + const T *data = (const T*)info.data; + return read(data[ix + iy*width + iz*width*height]); + } + + static ccl_always_inline float4 interp_3d_linear(const TextureInfo& info, float x, float y, float z) + { + int width = info.width; + int height = info.height; + int depth = info.depth; + int ix, iy, iz; + int nix, niy, niz; + + float tx = frac(x*(float)width - 0.5f, &ix); + float ty = frac(y*(float)height - 0.5f, &iy); + float tz = frac(z*(float)depth - 0.5f, &iz); + + switch(info.extension) { + case EXTENSION_REPEAT: + ix = wrap_periodic(ix, width); + iy = wrap_periodic(iy, height); + iz = wrap_periodic(iz, depth); + + nix = wrap_periodic(ix+1, width); + niy = wrap_periodic(iy+1, height); + niz = wrap_periodic(iz+1, depth); + break; + case EXTENSION_CLIP: + if(x < 0.0f || y < 0.0f || z < 0.0f || + x > 1.0f || y > 1.0f || z > 1.0f) + { + return make_float4(0.0f, 0.0f, 0.0f, 0.0f); + } + ATTR_FALLTHROUGH; + case EXTENSION_EXTEND: + nix = 
wrap_clamp(ix+1, width); + niy = wrap_clamp(iy+1, height); + niz = wrap_clamp(iz+1, depth); + + ix = wrap_clamp(ix, width); + iy = wrap_clamp(iy, height); + iz = wrap_clamp(iz, depth); + break; + default: + kernel_assert(0); + return make_float4(0.0f, 0.0f, 0.0f, 0.0f); + } + + const T *data = (const T*)info.data; + float4 r; + + r = (1.0f - tz)*(1.0f - ty)*(1.0f - tx)*read(data[ix + iy*width + iz*width*height]); + r += (1.0f - tz)*(1.0f - ty)*tx*read(data[nix + iy*width + iz*width*height]); + r += (1.0f - tz)*ty*(1.0f - tx)*read(data[ix + niy*width + iz*width*height]); + r += (1.0f - tz)*ty*tx*read(data[nix + niy*width + iz*width*height]); + + r += tz*(1.0f - ty)*(1.0f - tx)*read(data[ix + iy*width + niz*width*height]); + r += tz*(1.0f - ty)*tx*read(data[nix + iy*width + niz*width*height]); + r += tz*ty*(1.0f - tx)*read(data[ix + niy*width + niz*width*height]); + r += tz*ty*tx*read(data[nix + niy*width + niz*width*height]); + + return r; + } + + /* TODO(sergey): For some unspeakable reason both GCC-6 and Clang-3.9 are + * causing stack overflow issue in this function unless it is inlined. + * + * Only happens for AVX2 kernel and global __KERNEL_SSE__ vectorization + * enabled. + */ +#ifdef __GNUC__ + static ccl_always_inline +#else + static ccl_never_inline +#endif + float4 interp_3d_tricubic(const TextureInfo& info, float x, float y, float z) + { + int width = info.width; + int height = info.height; + int depth = info.depth; + int ix, iy, iz; + int nix, niy, niz; + /* Tricubic b-spline interpolation. 
*/ + const float tx = frac(x*(float)width - 0.5f, &ix); + const float ty = frac(y*(float)height - 0.5f, &iy); + const float tz = frac(z*(float)depth - 0.5f, &iz); + int pix, piy, piz, nnix, nniy, nniz; + + switch(info.extension) { + case EXTENSION_REPEAT: + ix = wrap_periodic(ix, width); + iy = wrap_periodic(iy, height); + iz = wrap_periodic(iz, depth); + + pix = wrap_periodic(ix-1, width); + piy = wrap_periodic(iy-1, height); + piz = wrap_periodic(iz-1, depth); + + nix = wrap_periodic(ix+1, width); + niy = wrap_periodic(iy+1, height); + niz = wrap_periodic(iz+1, depth); + + nnix = wrap_periodic(ix+2, width); + nniy = wrap_periodic(iy+2, height); + nniz = wrap_periodic(iz+2, depth); + break; + case EXTENSION_CLIP: + if(x < 0.0f || y < 0.0f || z < 0.0f || + x > 1.0f || y > 1.0f || z > 1.0f) + { + return make_float4(0.0f, 0.0f, 0.0f, 0.0f); + } + ATTR_FALLTHROUGH; + case EXTENSION_EXTEND: + pix = wrap_clamp(ix-1, width); + piy = wrap_clamp(iy-1, height); + piz = wrap_clamp(iz-1, depth); + + nix = wrap_clamp(ix+1, width); + niy = wrap_clamp(iy+1, height); + niz = wrap_clamp(iz+1, depth); + + nnix = wrap_clamp(ix+2, width); + nniy = wrap_clamp(iy+2, height); + nniz = wrap_clamp(iz+2, depth); + + ix = wrap_clamp(ix, width); + iy = wrap_clamp(iy, height); + iz = wrap_clamp(iz, depth); + break; + default: + kernel_assert(0); + return make_float4(0.0f, 0.0f, 0.0f, 0.0f); + } + + const int xc[4] = {pix, ix, nix, nnix}; + const int yc[4] = {width * piy, + width * iy, + width * niy, + width * nniy}; + const int zc[4] = {width * height * piz, + width * height * iz, + width * height * niz, + width * height * nniz}; + float u[4], v[4], w[4]; + + /* Some helper macro to keep code reasonable size, + * let compiler to inline all the matrix multiplications. 
+ */ +#define DATA(x, y, z) (read(data[xc[x] + yc[y] + zc[z]])) +#define COL_TERM(col, row) \ + (v[col] * (u[0] * DATA(0, col, row) + \ + u[1] * DATA(1, col, row) + \ + u[2] * DATA(2, col, row) + \ + u[3] * DATA(3, col, row))) +#define ROW_TERM(row) \ + (w[row] * (COL_TERM(0, row) + \ + COL_TERM(1, row) + \ + COL_TERM(2, row) + \ + COL_TERM(3, row))) + + SET_CUBIC_SPLINE_WEIGHTS(u, tx); + SET_CUBIC_SPLINE_WEIGHTS(v, ty); + SET_CUBIC_SPLINE_WEIGHTS(w, tz); + + /* Actual interpolation. */ + const T *data = (const T*)info.data; + return ROW_TERM(0) + ROW_TERM(1) + ROW_TERM(2) + ROW_TERM(3); + +#undef COL_TERM +#undef ROW_TERM +#undef DATA + } + + static ccl_always_inline float4 interp_3d(const TextureInfo& info, + float x, float y, float z, + InterpolationType interp) + { + if(UNLIKELY(!info.data)) + return make_float4(0.0f, 0.0f, 0.0f, 0.0f); + + switch((interp == INTERPOLATION_NONE)? info.interpolation: interp) { + case INTERPOLATION_CLOSEST: + return interp_3d_closest(info, x, y, z); + case INTERPOLATION_LINEAR: + return interp_3d_linear(info, x, y, z); + default: + return interp_3d_tricubic(info, x, y, z); + } + } +#undef SET_CUBIC_SPLINE_WEIGHTS +}; + +ccl_device float4 kernel_tex_image_interp(KernelGlobals *kg, int id, float x, float y) { - if(tex >= TEX_START_HALF_CPU) - return kg->texture_half_images[tex - TEX_START_HALF_CPU].interp_3d(x, y, z); - else if(tex >= TEX_START_BYTE_CPU) - return kg->texture_byte_images[tex - TEX_START_BYTE_CPU].interp_3d(x, y, z); - else if(tex >= TEX_START_FLOAT_CPU) - return kg->texture_float_images[tex - TEX_START_FLOAT_CPU].interp_3d(x, y, z); - else if(tex >= TEX_START_HALF4_CPU) - return kg->texture_half4_images[tex - TEX_START_HALF4_CPU].interp_3d(x, y, z); - else if(tex >= TEX_START_BYTE4_CPU) - return kg->texture_byte4_images[tex - TEX_START_BYTE4_CPU].interp_3d(x, y, z); - else - return kg->texture_float4_images[tex].interp_3d(x, y, z); + const TextureInfo& info = kernel_tex_fetch(__texture_info, id); + 
switch(kernel_tex_type(id)) { + case IMAGE_DATA_TYPE_HALF: + return TextureInterpolator<half>::interp(info, x, y); + case IMAGE_DATA_TYPE_BYTE: + return TextureInterpolator<uchar>::interp(info, x, y); + case IMAGE_DATA_TYPE_FLOAT: + return TextureInterpolator<float>::interp(info, x, y); + case IMAGE_DATA_TYPE_HALF4: + return TextureInterpolator<half4>::interp(info, x, y); + case IMAGE_DATA_TYPE_BYTE4: + return TextureInterpolator<uchar4>::interp(info, x, y); + case IMAGE_DATA_TYPE_FLOAT4: + default: + return TextureInterpolator<float4>::interp(info, x, y); + } } -ccl_device float4 kernel_tex_image_interp_3d_ex_impl(KernelGlobals *kg, int tex, float x, float y, float z, int interpolation) +ccl_device float4 kernel_tex_image_interp_3d(KernelGlobals *kg, int id, float x, float y, float z, InterpolationType interp) { - if(tex >= TEX_START_HALF_CPU) - return kg->texture_half_images[tex - TEX_START_HALF_CPU].interp_3d_ex(x, y, z, interpolation); - else if(tex >= TEX_START_BYTE_CPU) - return kg->texture_byte_images[tex - TEX_START_BYTE_CPU].interp_3d_ex(x, y, z, interpolation); - else if(tex >= TEX_START_FLOAT_CPU) - return kg->texture_float_images[tex - TEX_START_FLOAT_CPU].interp_3d_ex(x, y, z, interpolation); - else if(tex >= TEX_START_HALF4_CPU) - return kg->texture_half4_images[tex - TEX_START_HALF4_CPU].interp_3d_ex(x, y, z, interpolation); - else if(tex >= TEX_START_BYTE4_CPU) - return kg->texture_byte4_images[tex - TEX_START_BYTE4_CPU].interp_3d_ex(x, y, z, interpolation); - else - return kg->texture_float4_images[tex].interp_3d_ex(x, y, z, interpolation); + const TextureInfo& info = kernel_tex_fetch(__texture_info, id); + + switch(kernel_tex_type(id)) { + case IMAGE_DATA_TYPE_HALF: + return TextureInterpolator<half>::interp_3d(info, x, y, z, interp); + case IMAGE_DATA_TYPE_BYTE: + return TextureInterpolator<uchar>::interp_3d(info, x, y, z, interp); + case IMAGE_DATA_TYPE_FLOAT: + return TextureInterpolator<float>::interp_3d(info, x, y, z, interp); + case 
IMAGE_DATA_TYPE_HALF4: + return TextureInterpolator<half4>::interp_3d(info, x, y, z, interp); + case IMAGE_DATA_TYPE_BYTE4: + return TextureInterpolator<uchar4>::interp_3d(info, x, y, z, interp); + case IMAGE_DATA_TYPE_FLOAT4: + default: + return TextureInterpolator<float4>::interp_3d(info, x, y, z, interp); + } } CCL_NAMESPACE_END -#endif // __KERNEL_CPU__ - - #endif // __KERNEL_CPU_IMAGE_H__ diff --git a/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h b/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h index ec82d4b4c22..fdeb7dcd3e4 100644 --- a/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h +++ b/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h @@ -20,43 +20,84 @@ * simply includes this file without worry of copying actual implementation over. */ -#include "kernel_compat_cpu.h" -#include "kernel_math.h" -#include "kernel_types.h" -#include "kernel_globals.h" -#include "kernel_cpu_image.h" -#include "kernel_film.h" -#include "kernel_path.h" -#include "kernel_path_branched.h" -#include "kernel_bake.h" +#include "kernel/kernel_compat_cpu.h" + +#ifndef KERNEL_STUB +# ifndef __SPLIT_KERNEL__ +# include "kernel/kernel_math.h" +# include "kernel/kernel_types.h" + +# include "kernel/split/kernel_split_data.h" +# include "kernel/kernel_globals.h" + +# include "kernel/kernels/cpu/kernel_cpu_image.h" +# include "kernel/kernel_film.h" +# include "kernel/kernel_path.h" +# include "kernel/kernel_path_branched.h" +# include "kernel/kernel_bake.h" +# else +# include "kernel/split/kernel_split_common.h" + +# include "kernel/split/kernel_data_init.h" +# include "kernel/split/kernel_path_init.h" +# include "kernel/split/kernel_scene_intersect.h" +# include "kernel/split/kernel_lamp_emission.h" +# include "kernel/split/kernel_do_volume.h" +# include "kernel/split/kernel_queue_enqueue.h" +# include "kernel/split/kernel_indirect_background.h" +# include "kernel/split/kernel_shader_setup.h" +# include "kernel/split/kernel_shader_sort.h" +# include 
"kernel/split/kernel_shader_eval.h" +# include "kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h" +# include "kernel/split/kernel_subsurface_scatter.h" +# include "kernel/split/kernel_direct_lighting.h" +# include "kernel/split/kernel_shadow_blocked_ao.h" +# include "kernel/split/kernel_shadow_blocked_dl.h" +# include "kernel/split/kernel_enqueue_inactive.h" +# include "kernel/split/kernel_next_iteration_setup.h" +# include "kernel/split/kernel_indirect_subsurface.h" +# include "kernel/split/kernel_buffer_update.h" +# endif /* __SPLIT_KERNEL__ */ +#else +# include "util/util_debug.h" +# define STUB_ASSERT(arch, name) assert(!(#name " kernel stub for architecture " #arch " was called!")) + +# ifdef __SPLIT_KERNEL__ +# include "kernel/split/kernel_data_init.h" +# endif /* __SPLIT_KERNEL__ */ +#endif /* KERNEL_STUB */ CCL_NAMESPACE_BEGIN +#ifndef __SPLIT_KERNEL__ + /* Path Tracing */ void KERNEL_FUNCTION_FULL_NAME(path_trace)(KernelGlobals *kg, float *buffer, - unsigned int *rng_state, int sample, int x, int y, int offset, int stride) { -#ifdef __BRANCHED_PATH__ +#ifdef KERNEL_STUB + STUB_ASSERT(KERNEL_ARCH, path_trace); +#else +# ifdef __BRANCHED_PATH__ if(kernel_data.integrator.branched) { kernel_branched_path_trace(kg, buffer, - rng_state, sample, x, y, offset, stride); } else -#endif +# endif { - kernel_path_trace(kg, buffer, rng_state, sample, x, y, offset, stride); + kernel_path_trace(kg, buffer, sample, x, y, offset, stride); } +#endif /* KERNEL_STUB */ } /* Film */ @@ -69,6 +110,9 @@ void KERNEL_FUNCTION_FULL_NAME(convert_to_byte)(KernelGlobals *kg, int offset, int stride) { +#ifdef KERNEL_STUB + STUB_ASSERT(KERNEL_ARCH, convert_to_byte); +#else kernel_film_convert_to_byte(kg, rgba, buffer, @@ -76,6 +120,7 @@ void KERNEL_FUNCTION_FULL_NAME(convert_to_byte)(KernelGlobals *kg, x, y, offset, stride); +#endif /* KERNEL_STUB */ } void KERNEL_FUNCTION_FULL_NAME(convert_to_half_float)(KernelGlobals *kg, @@ -86,6 +131,9 @@ void 
KERNEL_FUNCTION_FULL_NAME(convert_to_half_float)(KernelGlobals *kg, int offset, int stride) { +#ifdef KERNEL_STUB + STUB_ASSERT(KERNEL_ARCH, convert_to_half_float); +#else kernel_film_convert_to_half_float(kg, rgba, buffer, @@ -93,6 +141,7 @@ void KERNEL_FUNCTION_FULL_NAME(convert_to_half_float)(KernelGlobals *kg, x, y, offset, stride); +#endif /* KERNEL_STUB */ } /* Shader Evaluate */ @@ -100,16 +149,17 @@ void KERNEL_FUNCTION_FULL_NAME(convert_to_half_float)(KernelGlobals *kg, void KERNEL_FUNCTION_FULL_NAME(shader)(KernelGlobals *kg, uint4 *input, float4 *output, - float *output_luma, int type, int filter, int i, int offset, int sample) { +#ifdef KERNEL_STUB + STUB_ASSERT(KERNEL_ARCH, shader); +#else if(type >= SHADER_EVAL_BAKE) { - kernel_assert(output_luma == NULL); -#ifdef __BAKING__ +# ifdef __BAKING__ kernel_bake_evaluate(kg, input, output, @@ -118,17 +168,70 @@ void KERNEL_FUNCTION_FULL_NAME(shader)(KernelGlobals *kg, i, offset, sample); -#endif +# endif + } + else if(type == SHADER_EVAL_DISPLACE) { + kernel_displace_evaluate(kg, input, output, i); } else { - kernel_shader_evaluate(kg, - input, - output, - output_luma, - (ShaderEvalType)type, - i, - sample); + kernel_background_evaluate(kg, input, output, i); } +#endif /* KERNEL_STUB */ } +#else /* __SPLIT_KERNEL__ */ + +/* Split Kernel Path Tracing */ + +#ifdef KERNEL_STUB +# define DEFINE_SPLIT_KERNEL_FUNCTION(name) \ + void KERNEL_FUNCTION_FULL_NAME(name)(KernelGlobals *kg, KernelData* /*data*/) \ + { \ + STUB_ASSERT(KERNEL_ARCH, name); \ + } + +# define DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(name, type) \ + void KERNEL_FUNCTION_FULL_NAME(name)(KernelGlobals *kg, KernelData* /*data*/) \ + { \ + STUB_ASSERT(KERNEL_ARCH, name); \ + } +#else +# define DEFINE_SPLIT_KERNEL_FUNCTION(name) \ + void KERNEL_FUNCTION_FULL_NAME(name)(KernelGlobals *kg, KernelData* /*data*/) \ + { \ + kernel_##name(kg); \ + } + +# define DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(name, type) \ + void 
KERNEL_FUNCTION_FULL_NAME(name)(KernelGlobals *kg, KernelData* /*data*/) \ + { \ + ccl_local type locals; \ + kernel_##name(kg, &locals); \ + } +#endif /* KERNEL_STUB */ + +DEFINE_SPLIT_KERNEL_FUNCTION(path_init) +DEFINE_SPLIT_KERNEL_FUNCTION(scene_intersect) +DEFINE_SPLIT_KERNEL_FUNCTION(lamp_emission) +DEFINE_SPLIT_KERNEL_FUNCTION(do_volume) +DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(queue_enqueue, QueueEnqueueLocals) +DEFINE_SPLIT_KERNEL_FUNCTION(indirect_background) +DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(shader_setup, uint) +DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(shader_sort, ShaderSortLocals) +DEFINE_SPLIT_KERNEL_FUNCTION(shader_eval) +DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(holdout_emission_blurring_pathtermination_ao, BackgroundAOLocals) +DEFINE_SPLIT_KERNEL_FUNCTION(subsurface_scatter) +DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(direct_lighting, uint) +DEFINE_SPLIT_KERNEL_FUNCTION(shadow_blocked_ao) +DEFINE_SPLIT_KERNEL_FUNCTION(shadow_blocked_dl) +DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(enqueue_inactive, uint) +DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(next_iteration_setup, uint) +DEFINE_SPLIT_KERNEL_FUNCTION(indirect_subsurface) +DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(buffer_update, uint) +#endif /* __SPLIT_KERNEL__ */ + +#undef KERNEL_STUB +#undef STUB_ASSERT +#undef KERNEL_ARCH + CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernels/cpu/kernel_split.cpp b/intern/cycles/kernel/kernels/cpu/kernel_split.cpp new file mode 100644 index 00000000000..ca750e5a00d --- /dev/null +++ b/intern/cycles/kernel/kernels/cpu/kernel_split.cpp @@ -0,0 +1,63 @@ +/* + * Copyright 2011-2013 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* CPU kernel entry points */ + +/* On x86-64, we can assume SSE2, so avoid the extra kernel and compile this + * one with SSE2 intrinsics. + */ +#if defined(__x86_64__) || defined(_M_X64) +# define __KERNEL_SSE2__ +#endif + +#define __SPLIT_KERNEL__ + +/* When building kernel for native machine detect kernel features from the flags + * set by compiler. + */ +#ifdef WITH_KERNEL_NATIVE +# ifdef __SSE2__ +# ifndef __KERNEL_SSE2__ +# define __KERNEL_SSE2__ +# endif +# endif +# ifdef __SSE3__ +# define __KERNEL_SSE3__ +# endif +# ifdef __SSSE3__ +# define __KERNEL_SSSE3__ +# endif +# ifdef __SSE4_1__ +# define __KERNEL_SSE41__ +# endif +# ifdef __AVX__ +# define __KERNEL_AVX__ +# endif +# ifdef __AVX2__ +# define __KERNEL_SSE__ +# define __KERNEL_AVX2__ +# endif +#endif + +/* quiet unused define warnings */ +#if defined(__KERNEL_SSE2__) + /* do nothing */ +#endif + +#include "kernel/kernel.h" +#define KERNEL_ARCH cpu +#include "kernel/kernels/cpu/kernel_cpu_impl.h" + diff --git a/intern/cycles/kernel/kernels/cpu/kernel_split_avx.cpp b/intern/cycles/kernel/kernels/cpu/kernel_split_avx.cpp new file mode 100644 index 00000000000..6ba3425a343 --- /dev/null +++ b/intern/cycles/kernel/kernels/cpu/kernel_split_avx.cpp @@ -0,0 +1,41 @@ +/* + * Copyright 2011-2013 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* Optimized CPU kernel entry points. This file is compiled with AVX + * optimization flags and nearly all functions inlined, while kernel.cpp + * is compiled without for other CPU's. */ + +#define __SPLIT_KERNEL__ + +#include "util/util_optimization.h" + +#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_AVX +# define KERNEL_STUB +#else +/* SSE optimization disabled for now on 32 bit, see bug #36316 */ +# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) +# define __KERNEL_SSE__ +# define __KERNEL_SSE2__ +# define __KERNEL_SSE3__ +# define __KERNEL_SSSE3__ +# define __KERNEL_SSE41__ +# define __KERNEL_AVX__ +# endif +#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX */ + +#include "kernel/kernel.h" +#define KERNEL_ARCH cpu_avx +#include "kernel/kernels/cpu/kernel_cpu_impl.h" diff --git a/intern/cycles/kernel/kernels/cpu/kernel_split_avx2.cpp b/intern/cycles/kernel/kernels/cpu/kernel_split_avx2.cpp new file mode 100644 index 00000000000..76b2d77ebb8 --- /dev/null +++ b/intern/cycles/kernel/kernels/cpu/kernel_split_avx2.cpp @@ -0,0 +1,42 @@ +/* + * Copyright 2011-2014 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* Optimized CPU kernel entry points. This file is compiled with AVX2 + * optimization flags and nearly all functions inlined, while kernel.cpp + * is compiled without for other CPU's. */ + +#define __SPLIT_KERNEL__ + +#include "util/util_optimization.h" + +#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 +# define KERNEL_STUB +#else +/* SSE optimization disabled for now on 32 bit, see bug #36316 */ +# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) +# define __KERNEL_SSE__ +# define __KERNEL_SSE2__ +# define __KERNEL_SSE3__ +# define __KERNEL_SSSE3__ +# define __KERNEL_SSE41__ +# define __KERNEL_AVX__ +# define __KERNEL_AVX2__ +# endif +#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 */ + +#include "kernel/kernel.h" +#define KERNEL_ARCH cpu_avx2 +#include "kernel/kernels/cpu/kernel_cpu_impl.h" diff --git a/intern/cycles/kernel/kernels/cpu/kernel_split_sse2.cpp b/intern/cycles/kernel/kernels/cpu/kernel_split_sse2.cpp new file mode 100644 index 00000000000..b468b6f44c8 --- /dev/null +++ b/intern/cycles/kernel/kernels/cpu/kernel_split_sse2.cpp @@ -0,0 +1,36 @@ +/* + * Copyright 2011-2013 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* Optimized CPU kernel entry points. This file is compiled with SSE2 + * optimization flags and nearly all functions inlined, while kernel.cpp + * is compiled without for other CPU's. 
*/ + +#define __SPLIT_KERNEL__ + +#include "util/util_optimization.h" + +#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 +# define KERNEL_STUB +#else +/* SSE optimization disabled for now on 32 bit, see bug #36316 */ +# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) +# define __KERNEL_SSE2__ +# endif +#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 */ + +#include "kernel/kernel.h" +#define KERNEL_ARCH cpu_sse2 +#include "kernel/kernels/cpu/kernel_cpu_impl.h" diff --git a/intern/cycles/kernel/kernels/cpu/kernel_split_sse3.cpp b/intern/cycles/kernel/kernels/cpu/kernel_split_sse3.cpp new file mode 100644 index 00000000000..3e5792d0b17 --- /dev/null +++ b/intern/cycles/kernel/kernels/cpu/kernel_split_sse3.cpp @@ -0,0 +1,38 @@ +/* + * Copyright 2011-2013 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* Optimized CPU kernel entry points. This file is compiled with SSE3/SSSE3 + * optimization flags and nearly all functions inlined, while kernel.cpp + * is compiled without for other CPU's. 
*/ + +#define __SPLIT_KERNEL__ + +#include "util/util_optimization.h" + +#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 +# define KERNEL_STUB +#else +/* SSE optimization disabled for now on 32 bit, see bug #36316 */ +# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) +# define __KERNEL_SSE2__ +# define __KERNEL_SSE3__ +# define __KERNEL_SSSE3__ +# endif +#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 */ + +#include "kernel/kernel.h" +#define KERNEL_ARCH cpu_sse3 +#include "kernel/kernels/cpu/kernel_cpu_impl.h" diff --git a/intern/cycles/kernel/kernels/cpu/kernel_split_sse41.cpp b/intern/cycles/kernel/kernels/cpu/kernel_split_sse41.cpp new file mode 100644 index 00000000000..3629f21cd29 --- /dev/null +++ b/intern/cycles/kernel/kernels/cpu/kernel_split_sse41.cpp @@ -0,0 +1,39 @@ +/* + * Copyright 2011-2013 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* Optimized CPU kernel entry points. This file is compiled with SSE3/SSSE3 + * optimization flags and nearly all functions inlined, while kernel.cpp + * is compiled without for other CPU's. 
*/ + +#define __SPLIT_KERNEL__ + +#include "util/util_optimization.h" + +#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 +# define KERNEL_STUB +#else +/* SSE optimization disabled for now on 32 bit, see bug #36316 */ +# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) +# define __KERNEL_SSE2__ +# define __KERNEL_SSE3__ +# define __KERNEL_SSSE3__ +# define __KERNEL_SSE41__ +# endif +#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 */ + +#include "kernel/kernel.h" +#define KERNEL_ARCH cpu_sse41 +#include "kernel/kernels/cpu/kernel_cpu_impl.h" diff --git a/intern/cycles/kernel/kernels/cpu/kernel_sse2.cpp b/intern/cycles/kernel/kernels/cpu/kernel_sse2.cpp index a5f2d6e7294..57530c88710 100644 --- a/intern/cycles/kernel/kernels/cpu/kernel_sse2.cpp +++ b/intern/cycles/kernel/kernels/cpu/kernel_sse2.cpp @@ -18,15 +18,17 @@ * optimization flags and nearly all functions inlined, while kernel.cpp * is compiled without for other CPU's. */ -/* SSE optimization disabled for now on 32 bit, see bug #36316 */ -#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) -# define __KERNEL_SSE2__ -#endif - -#include "util_optimization.h" +#include "util/util_optimization.h" -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 -# include "kernel.h" -# define KERNEL_ARCH cpu_sse2 -# include "kernel_cpu_impl.h" +#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 +# define KERNEL_STUB +#else +/* SSE optimization disabled for now on 32 bit, see bug #36316 */ +# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) +# define __KERNEL_SSE2__ +# endif #endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 */ + +#include "kernel/kernel.h" +#define KERNEL_ARCH cpu_sse2 +#include "kernel/kernels/cpu/kernel_cpu_impl.h" diff --git a/intern/cycles/kernel/kernels/cpu/kernel_sse3.cpp b/intern/cycles/kernel/kernels/cpu/kernel_sse3.cpp index 86f9ce991f8..c607753bc4b 100644 --- a/intern/cycles/kernel/kernels/cpu/kernel_sse3.cpp +++ b/intern/cycles/kernel/kernels/cpu/kernel_sse3.cpp @@ -18,17 +18,19 @@ * 
optimization flags and nearly all functions inlined, while kernel.cpp * is compiled without for other CPU's. */ -/* SSE optimization disabled for now on 32 bit, see bug #36316 */ -#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) -# define __KERNEL_SSE2__ -# define __KERNEL_SSE3__ -# define __KERNEL_SSSE3__ -#endif - -#include "util_optimization.h" +#include "util/util_optimization.h" -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 -# include "kernel.h" -# define KERNEL_ARCH cpu_sse3 -# include "kernel_cpu_impl.h" +#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 +# define KERNEL_STUB +#else +/* SSE optimization disabled for now on 32 bit, see bug #36316 */ +# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) +# define __KERNEL_SSE2__ +# define __KERNEL_SSE3__ +# define __KERNEL_SSSE3__ +# endif #endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 */ + +#include "kernel/kernel.h" +#define KERNEL_ARCH cpu_sse3 +#include "kernel/kernels/cpu/kernel_cpu_impl.h" diff --git a/intern/cycles/kernel/kernels/cpu/kernel_sse41.cpp b/intern/cycles/kernel/kernels/cpu/kernel_sse41.cpp index c174406047d..a278554731c 100644 --- a/intern/cycles/kernel/kernels/cpu/kernel_sse41.cpp +++ b/intern/cycles/kernel/kernels/cpu/kernel_sse41.cpp @@ -18,18 +18,20 @@ * optimization flags and nearly all functions inlined, while kernel.cpp * is compiled without for other CPU's. 
*/ -/* SSE optimization disabled for now on 32 bit, see bug #36316 */ -#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) -# define __KERNEL_SSE2__ -# define __KERNEL_SSE3__ -# define __KERNEL_SSSE3__ -# define __KERNEL_SSE41__ -#endif - -#include "util_optimization.h" +#include "util/util_optimization.h" -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 -# include "kernel.h" -# define KERNEL_ARCH cpu_sse41 -# include "kernel_cpu_impl.h" +#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 +# define KERNEL_STUB +#else +/* SSE optimization disabled for now on 32 bit, see bug #36316 */ +# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) +# define __KERNEL_SSE2__ +# define __KERNEL_SSE3__ +# define __KERNEL_SSSE3__ +# define __KERNEL_SSE41__ +# endif #endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 */ + +#include "kernel/kernel.h" +#define KERNEL_ARCH cpu_sse41 +#include "kernel/kernels/cpu/kernel_cpu_impl.h" diff --git a/intern/cycles/kernel/kernels/cuda/filter.cu b/intern/cycles/kernel/kernels/cuda/filter.cu new file mode 100644 index 00000000000..c8172355a7f --- /dev/null +++ b/intern/cycles/kernel/kernels/cuda/filter.cu @@ -0,0 +1,251 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* CUDA kernel entry points */ + +#ifdef __CUDA_ARCH__ + +#include "kernel_config.h" + +#include "kernel/kernel_compat_cuda.h" + +#include "kernel/filter/filter_kernel.h" + +/* kernels */ + +extern "C" __global__ void +CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) +kernel_cuda_filter_divide_shadow(int sample, + TilesInfo *tiles, + float *unfilteredA, + float *unfilteredB, + float *sampleVariance, + float *sampleVarianceV, + float *bufferVariance, + int4 prefilter_rect, + int buffer_pass_stride, + int buffer_denoising_offset) +{ + int x = prefilter_rect.x + blockDim.x*blockIdx.x + threadIdx.x; + int y = prefilter_rect.y + blockDim.y*blockIdx.y + threadIdx.y; + if(x < prefilter_rect.z && y < prefilter_rect.w) { + kernel_filter_divide_shadow(sample, + tiles, + x, y, + unfilteredA, + unfilteredB, + sampleVariance, + sampleVarianceV, + bufferVariance, + prefilter_rect, + buffer_pass_stride, + buffer_denoising_offset); + } +} + +extern "C" __global__ void +CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) +kernel_cuda_filter_get_feature(int sample, + TilesInfo *tiles, + int m_offset, + int v_offset, + float *mean, + float *variance, + int4 prefilter_rect, + int buffer_pass_stride, + int buffer_denoising_offset) +{ + int x = prefilter_rect.x + blockDim.x*blockIdx.x + threadIdx.x; + int y = prefilter_rect.y + blockDim.y*blockIdx.y + threadIdx.y; + if(x < prefilter_rect.z && y < prefilter_rect.w) { + kernel_filter_get_feature(sample, + tiles, + m_offset, v_offset, + x, y, + mean, variance, + prefilter_rect, + buffer_pass_stride, + buffer_denoising_offset); + } +} + +extern "C" __global__ void +CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) +kernel_cuda_filter_detect_outliers(float *image, + float *variance, + float *depth, + float *output, + int4 prefilter_rect, + int pass_stride) +{ + int x = prefilter_rect.x + blockDim.x*blockIdx.x + threadIdx.x; + int y = prefilter_rect.y + 
blockDim.y*blockIdx.y + threadIdx.y; + if(x < prefilter_rect.z && y < prefilter_rect.w) { + kernel_filter_detect_outliers(x, y, image, variance, depth, output, prefilter_rect, pass_stride); + } +} + +extern "C" __global__ void +CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) +kernel_cuda_filter_combine_halves(float *mean, float *variance, float *a, float *b, int4 prefilter_rect, int r) +{ + int x = prefilter_rect.x + blockDim.x*blockIdx.x + threadIdx.x; + int y = prefilter_rect.y + blockDim.y*blockIdx.y + threadIdx.y; + if(x < prefilter_rect.z && y < prefilter_rect.w) { + kernel_filter_combine_halves(x, y, mean, variance, a, b, prefilter_rect, r); + } +} + +extern "C" __global__ void +CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) +kernel_cuda_filter_construct_transform(float const* __restrict__ buffer, + float *transform, int *rank, + int4 filter_area, int4 rect, + int radius, float pca_threshold, + int pass_stride) +{ + int x = blockDim.x*blockIdx.x + threadIdx.x; + int y = blockDim.y*blockIdx.y + threadIdx.y; + if(x < filter_area.z && y < filter_area.w) { + int *l_rank = rank + y*filter_area.z + x; + float *l_transform = transform + y*filter_area.z + x; + kernel_filter_construct_transform(buffer, + x + filter_area.x, y + filter_area.y, + rect, pass_stride, + l_transform, l_rank, + radius, pca_threshold, + filter_area.z*filter_area.w, + threadIdx.y*blockDim.x + threadIdx.x); + } +} + +extern "C" __global__ void +CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) +kernel_cuda_filter_nlm_calc_difference(int dx, int dy, + const float *ccl_restrict weight_image, + const float *ccl_restrict variance_image, + float *difference_image, + int4 rect, int w, + int channel_offset, + float a, float k_2) +{ + int x = blockDim.x*blockIdx.x + threadIdx.x + rect.x; + int y = blockDim.y*blockIdx.y + threadIdx.y + rect.y; + if(x < rect.z && y < rect.w) { + kernel_filter_nlm_calc_difference(x, y, dx, dy, 
weight_image, variance_image, difference_image, rect, w, channel_offset, a, k_2); + } +} + +extern "C" __global__ void +CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) +kernel_cuda_filter_nlm_blur(const float *ccl_restrict difference_image, float *out_image, int4 rect, int w, int f) +{ + int x = blockDim.x*blockIdx.x + threadIdx.x + rect.x; + int y = blockDim.y*blockIdx.y + threadIdx.y + rect.y; + if(x < rect.z && y < rect.w) { + kernel_filter_nlm_blur(x, y, difference_image, out_image, rect, w, f); + } +} + +extern "C" __global__ void +CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) +kernel_cuda_filter_nlm_calc_weight(const float *ccl_restrict difference_image, float *out_image, int4 rect, int w, int f) +{ + int x = blockDim.x*blockIdx.x + threadIdx.x + rect.x; + int y = blockDim.y*blockIdx.y + threadIdx.y + rect.y; + if(x < rect.z && y < rect.w) { + kernel_filter_nlm_calc_weight(x, y, difference_image, out_image, rect, w, f); + } +} + +extern "C" __global__ void +CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) +kernel_cuda_filter_nlm_update_output(int dx, int dy, + const float *ccl_restrict difference_image, + const float *ccl_restrict image, + float *out_image, float *accum_image, + int4 rect, int w, + int f) +{ + int x = blockDim.x*blockIdx.x + threadIdx.x + rect.x; + int y = blockDim.y*blockIdx.y + threadIdx.y + rect.y; + if(x < rect.z && y < rect.w) { + kernel_filter_nlm_update_output(x, y, dx, dy, difference_image, image, out_image, accum_image, rect, w, f); + } +} + +extern "C" __global__ void +CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) +kernel_cuda_filter_nlm_normalize(float *out_image, const float *ccl_restrict accum_image, int4 rect, int w) +{ + int x = blockDim.x*blockIdx.x + threadIdx.x + rect.x; + int y = blockDim.y*blockIdx.y + threadIdx.y + rect.y; + if(x < rect.z && y < rect.w) { + kernel_filter_nlm_normalize(x, y, out_image, accum_image, rect, w); 
+ } +} + +extern "C" __global__ void +CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) +kernel_cuda_filter_nlm_construct_gramian(int dx, int dy, + const float *ccl_restrict difference_image, + const float *ccl_restrict buffer, + float const* __restrict__ transform, + int *rank, + float *XtWX, + float3 *XtWY, + int4 rect, + int4 filter_rect, + int w, int h, int f, + int pass_stride) +{ + int x = blockDim.x*blockIdx.x + threadIdx.x + max(0, rect.x-filter_rect.x); + int y = blockDim.y*blockIdx.y + threadIdx.y + max(0, rect.y-filter_rect.y); + if(x < min(filter_rect.z, rect.z-filter_rect.x) && y < min(filter_rect.w, rect.w-filter_rect.y)) { + kernel_filter_nlm_construct_gramian(x, y, + dx, dy, + difference_image, + buffer, + transform, rank, + XtWX, XtWY, + rect, filter_rect, + w, h, f, + pass_stride, + threadIdx.y*blockDim.x + threadIdx.x); + } +} + +extern "C" __global__ void +CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) +kernel_cuda_filter_finalize(int w, int h, + float *buffer, int *rank, + float *XtWX, float3 *XtWY, + int4 filter_area, int4 buffer_params, + int sample) +{ + int x = blockDim.x*blockIdx.x + threadIdx.x; + int y = blockDim.y*blockIdx.y + threadIdx.y; + if(x < filter_area.z && y < filter_area.w) { + int storage_ofs = y*filter_area.z+x; + rank += storage_ofs; + XtWX += storage_ofs; + XtWY += storage_ofs; + kernel_filter_finalize(x, y, w, h, buffer, rank, filter_area.z*filter_area.w, XtWX, XtWY, buffer_params, sample); + } +} + +#endif + diff --git a/intern/cycles/kernel/kernels/cuda/kernel.cu b/intern/cycles/kernel/kernels/cuda/kernel.cu index eb2b6ea5414..3c93e00ccf1 100644 --- a/intern/cycles/kernel/kernels/cuda/kernel.cu +++ b/intern/cycles/kernel/kernels/cuda/kernel.cu @@ -16,134 +16,53 @@ /* CUDA kernel entry points */ -#include "../../kernel_compat_cuda.h" -#include "../../kernel_math.h" -#include "../../kernel_types.h" -#include "../../kernel_globals.h" -#include "../../kernel_film.h" -#include 
"../../kernel_path.h" -#include "../../kernel_path_branched.h" -#include "../../kernel_bake.h" - -/* device data taken from CUDA occupancy calculator */ - #ifdef __CUDA_ARCH__ -/* 2.0 and 2.1 */ -#if __CUDA_ARCH__ == 200 || __CUDA_ARCH__ == 210 -# define CUDA_MULTIPRESSOR_MAX_REGISTERS 32768 -# define CUDA_MULTIPROCESSOR_MAX_BLOCKS 8 -# define CUDA_BLOCK_MAX_THREADS 1024 -# define CUDA_THREAD_MAX_REGISTERS 63 - -/* tunable parameters */ -# define CUDA_THREADS_BLOCK_WIDTH 16 -# define CUDA_KERNEL_MAX_REGISTERS 32 -# define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 40 - -/* 3.0 and 3.5 */ -#elif __CUDA_ARCH__ == 300 || __CUDA_ARCH__ == 350 -# define CUDA_MULTIPRESSOR_MAX_REGISTERS 65536 -# define CUDA_MULTIPROCESSOR_MAX_BLOCKS 16 -# define CUDA_BLOCK_MAX_THREADS 1024 -# define CUDA_THREAD_MAX_REGISTERS 63 - -/* tunable parameters */ -# define CUDA_THREADS_BLOCK_WIDTH 16 -# define CUDA_KERNEL_MAX_REGISTERS 63 -# define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63 - -/* 3.2 */ -#elif __CUDA_ARCH__ == 320 -# define CUDA_MULTIPRESSOR_MAX_REGISTERS 32768 -# define CUDA_MULTIPROCESSOR_MAX_BLOCKS 16 -# define CUDA_BLOCK_MAX_THREADS 1024 -# define CUDA_THREAD_MAX_REGISTERS 63 - -/* tunable parameters */ -# define CUDA_THREADS_BLOCK_WIDTH 16 -# define CUDA_KERNEL_MAX_REGISTERS 63 -# define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63 - -/* 3.7 */ -#elif __CUDA_ARCH__ == 370 -# define CUDA_MULTIPRESSOR_MAX_REGISTERS 65536 -# define CUDA_MULTIPROCESSOR_MAX_BLOCKS 16 -# define CUDA_BLOCK_MAX_THREADS 1024 -# define CUDA_THREAD_MAX_REGISTERS 255 - -/* tunable parameters */ -# define CUDA_THREADS_BLOCK_WIDTH 16 -# define CUDA_KERNEL_MAX_REGISTERS 63 -# define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63 - -/* 5.0, 5.2, 5.3, 6.0, 6.1 */ -#elif __CUDA_ARCH__ >= 500 -# define CUDA_MULTIPRESSOR_MAX_REGISTERS 65536 -# define CUDA_MULTIPROCESSOR_MAX_BLOCKS 32 -# define CUDA_BLOCK_MAX_THREADS 1024 -# define CUDA_THREAD_MAX_REGISTERS 255 - -/* tunable parameters */ -# define CUDA_THREADS_BLOCK_WIDTH 16 -# define 
CUDA_KERNEL_MAX_REGISTERS 48 -# define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63 - -/* unknown architecture */ -#else -# error "Unknown or unsupported CUDA architecture, can't determine launch bounds" -#endif +#include "kernel/kernel_compat_cuda.h" +#include "kernel_config.h" -/* compute number of threads per block and minimum blocks per multiprocessor - * given the maximum number of registers per thread */ +#include "util/util_atomic.h" -#define CUDA_LAUNCH_BOUNDS(threads_block_width, thread_num_registers) \ - __launch_bounds__( \ - threads_block_width*threads_block_width, \ - CUDA_MULTIPRESSOR_MAX_REGISTERS/(threads_block_width*threads_block_width*thread_num_registers) \ - ) - -/* sanity checks */ - -#if CUDA_THREADS_BLOCK_WIDTH*CUDA_THREADS_BLOCK_WIDTH > CUDA_BLOCK_MAX_THREADS -# error "Maximum number of threads per block exceeded" -#endif - -#if CUDA_MULTIPRESSOR_MAX_REGISTERS/(CUDA_THREADS_BLOCK_WIDTH*CUDA_THREADS_BLOCK_WIDTH*CUDA_KERNEL_MAX_REGISTERS) > CUDA_MULTIPROCESSOR_MAX_BLOCKS -# error "Maximum number of blocks per multiprocessor exceeded" -#endif - -#if CUDA_KERNEL_MAX_REGISTERS > CUDA_THREAD_MAX_REGISTERS -# error "Maximum number of registers per thread exceeded" -#endif - -#if CUDA_KERNEL_BRANCHED_MAX_REGISTERS > CUDA_THREAD_MAX_REGISTERS -# error "Maximum number of registers per thread exceeded" -#endif +#include "kernel/kernel_math.h" +#include "kernel/kernel_types.h" +#include "kernel/kernel_globals.h" +#include "kernel/kernels/cuda/kernel_cuda_image.h" +#include "kernel/kernel_film.h" +#include "kernel/kernel_path.h" +#include "kernel/kernel_path_branched.h" +#include "kernel/kernel_bake.h" +#include "kernel/kernel_work_stealing.h" /* kernels */ - extern "C" __global__ void CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) -kernel_cuda_path_trace(float *buffer, uint *rng_state, int sample, int sx, int sy, int sw, int sh, int offset, int stride) +kernel_cuda_path_trace(WorkTile *tile, uint total_work_size) { - int x = sx + 
blockDim.x*blockIdx.x + threadIdx.x; - int y = sy + blockDim.y*blockIdx.y + threadIdx.y; + int work_index = ccl_global_id(0); - if(x < sx + sw && y < sy + sh) - kernel_path_trace(NULL, buffer, rng_state, sample, x, y, offset, stride); + if(work_index < total_work_size) { + uint x, y, sample; + get_work_pixel(tile, work_index, &x, &y, &sample); + + KernelGlobals kg; + kernel_path_trace(&kg, tile->buffer, sample, x, y, tile->offset, tile->stride); + } } #ifdef __BRANCHED_PATH__ extern "C" __global__ void CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_BRANCHED_MAX_REGISTERS) -kernel_cuda_branched_path_trace(float *buffer, uint *rng_state, int sample, int sx, int sy, int sw, int sh, int offset, int stride) +kernel_cuda_branched_path_trace(WorkTile *tile, uint total_work_size) { - int x = sx + blockDim.x*blockIdx.x + threadIdx.x; - int y = sy + blockDim.y*blockIdx.y + threadIdx.y; + int work_index = ccl_global_id(0); + + if(work_index < total_work_size) { + uint x, y, sample; + get_work_pixel(tile, work_index, &x, &y, &sample); - if(x < sx + sw && y < sy + sh) - kernel_branched_path_trace(NULL, buffer, rng_state, sample, x, y, offset, stride); + KernelGlobals kg; + kernel_branched_path_trace(&kg, tile->buffer, sample, x, y, tile->offset, tile->stride); + } } #endif @@ -154,8 +73,9 @@ kernel_cuda_convert_to_byte(uchar4 *rgba, float *buffer, float sample_scale, int int x = sx + blockDim.x*blockIdx.x + threadIdx.x; int y = sy + blockDim.y*blockIdx.y + threadIdx.y; - if(x < sx + sw && y < sy + sh) + if(x < sx + sw && y < sy + sh) { kernel_film_convert_to_byte(NULL, rgba, buffer, sample_scale, x, y, offset, stride); + } } extern "C" __global__ void @@ -165,31 +85,44 @@ kernel_cuda_convert_to_half_float(uchar4 *rgba, float *buffer, float sample_scal int x = sx + blockDim.x*blockIdx.x + threadIdx.x; int y = sy + blockDim.y*blockIdx.y + threadIdx.y; - if(x < sx + sw && y < sy + sh) + if(x < sx + sw && y < sy + sh) { kernel_film_convert_to_half_float(NULL, rgba, 
buffer, sample_scale, x, y, offset, stride); + } } extern "C" __global__ void CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) -kernel_cuda_shader(uint4 *input, - float4 *output, - float *output_luma, - int type, - int sx, - int sw, - int offset, - int sample) +kernel_cuda_displace(uint4 *input, + float4 *output, + int type, + int sx, + int sw, + int offset, + int sample) { int x = sx + blockDim.x*blockIdx.x + threadIdx.x; if(x < sx + sw) { - kernel_shader_evaluate(NULL, - input, - output, - output_luma, - (ShaderEvalType)type, - x, - sample); + KernelGlobals kg; + kernel_displace_evaluate(&kg, input, output, x); + } +} + +extern "C" __global__ void +CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) +kernel_cuda_background(uint4 *input, + float4 *output, + int type, + int sx, + int sw, + int offset, + int sample) +{ + int x = sx + blockDim.x*blockIdx.x + threadIdx.x; + + if(x < sx + sw) { + KernelGlobals kg; + kernel_background_evaluate(&kg, input, output, x); } } @@ -200,8 +133,10 @@ kernel_cuda_bake(uint4 *input, float4 *output, int type, int filter, int sx, int { int x = sx + blockDim.x*blockIdx.x + threadIdx.x; - if(x < sx + sw) - kernel_bake_evaluate(NULL, input, output, (ShaderEvalType)type, filter, x, offset, sample); + if(x < sx + sw) { + KernelGlobals kg; + kernel_bake_evaluate(&kg, input, output, (ShaderEvalType)type, filter, x, offset, sample); + } } #endif diff --git a/intern/cycles/kernel/kernels/cuda/kernel_config.h b/intern/cycles/kernel/kernels/cuda/kernel_config.h new file mode 100644 index 00000000000..7ae205b7e14 --- /dev/null +++ b/intern/cycles/kernel/kernels/cuda/kernel_config.h @@ -0,0 +1,115 @@ +/* + * Copyright 2011-2013 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* device data taken from CUDA occupancy calculator */ + +/* 2.0 and 2.1 */ +#if __CUDA_ARCH__ == 200 || __CUDA_ARCH__ == 210 +# define CUDA_MULTIPRESSOR_MAX_REGISTERS 32768 +# define CUDA_MULTIPROCESSOR_MAX_BLOCKS 8 +# define CUDA_BLOCK_MAX_THREADS 1024 +# define CUDA_THREAD_MAX_REGISTERS 63 + +/* tunable parameters */ +# define CUDA_THREADS_BLOCK_WIDTH 16 +# define CUDA_KERNEL_MAX_REGISTERS 32 +# define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 40 + +/* 3.0 and 3.5 */ +#elif __CUDA_ARCH__ == 300 || __CUDA_ARCH__ == 350 +# define CUDA_MULTIPRESSOR_MAX_REGISTERS 65536 +# define CUDA_MULTIPROCESSOR_MAX_BLOCKS 16 +# define CUDA_BLOCK_MAX_THREADS 1024 +# define CUDA_THREAD_MAX_REGISTERS 63 + +/* tunable parameters */ +# define CUDA_THREADS_BLOCK_WIDTH 16 +# define CUDA_KERNEL_MAX_REGISTERS 63 +# define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63 + +/* 3.2 */ +#elif __CUDA_ARCH__ == 320 +# define CUDA_MULTIPRESSOR_MAX_REGISTERS 32768 +# define CUDA_MULTIPROCESSOR_MAX_BLOCKS 16 +# define CUDA_BLOCK_MAX_THREADS 1024 +# define CUDA_THREAD_MAX_REGISTERS 63 + +/* tunable parameters */ +# define CUDA_THREADS_BLOCK_WIDTH 16 +# define CUDA_KERNEL_MAX_REGISTERS 63 +# define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63 + +/* 3.7 */ +#elif __CUDA_ARCH__ == 370 +# define CUDA_MULTIPRESSOR_MAX_REGISTERS 65536 +# define CUDA_MULTIPROCESSOR_MAX_BLOCKS 16 +# define CUDA_BLOCK_MAX_THREADS 1024 +# define CUDA_THREAD_MAX_REGISTERS 255 + +/* tunable parameters */ +# define CUDA_THREADS_BLOCK_WIDTH 16 +# define CUDA_KERNEL_MAX_REGISTERS 63 +# define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 
63 + +/* 5.0, 5.2, 5.3, 6.0, 6.1 */ +#elif __CUDA_ARCH__ >= 500 +# define CUDA_MULTIPRESSOR_MAX_REGISTERS 65536 +# define CUDA_MULTIPROCESSOR_MAX_BLOCKS 32 +# define CUDA_BLOCK_MAX_THREADS 1024 +# define CUDA_THREAD_MAX_REGISTERS 255 + +/* tunable parameters */ +# define CUDA_THREADS_BLOCK_WIDTH 16 +# define CUDA_KERNEL_MAX_REGISTERS 48 +# define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63 + +/* unknown architecture */ +#else +# error "Unknown or unsupported CUDA architecture, can't determine launch bounds" +#endif + +/* For split kernel using all registers seems fastest for now, but this + * is unlikely to be optimal once we resolve other bottlenecks. */ + +#define CUDA_KERNEL_SPLIT_MAX_REGISTERS CUDA_THREAD_MAX_REGISTERS + +/* Compute number of threads per block and minimum blocks per multiprocessor + * given the maximum number of registers per thread. */ + +#define CUDA_LAUNCH_BOUNDS(threads_block_width, thread_num_registers) \ + __launch_bounds__( \ + threads_block_width*threads_block_width, \ + CUDA_MULTIPRESSOR_MAX_REGISTERS/(threads_block_width*threads_block_width*thread_num_registers) \ + ) + +/* sanity checks */ + +#if CUDA_THREADS_BLOCK_WIDTH*CUDA_THREADS_BLOCK_WIDTH > CUDA_BLOCK_MAX_THREADS +# error "Maximum number of threads per block exceeded" +#endif + +#if CUDA_MULTIPRESSOR_MAX_REGISTERS/(CUDA_THREADS_BLOCK_WIDTH*CUDA_THREADS_BLOCK_WIDTH*CUDA_KERNEL_MAX_REGISTERS) > CUDA_MULTIPROCESSOR_MAX_BLOCKS +# error "Maximum number of blocks per multiprocessor exceeded" +#endif + +#if CUDA_KERNEL_MAX_REGISTERS > CUDA_THREAD_MAX_REGISTERS +# error "Maximum number of registers per thread exceeded" +#endif + +#if CUDA_KERNEL_BRANCHED_MAX_REGISTERS > CUDA_THREAD_MAX_REGISTERS +# error "Maximum number of registers per thread exceeded" +#endif + diff --git a/intern/cycles/kernel/kernels/cuda/kernel_cuda_image.h b/intern/cycles/kernel/kernels/cuda/kernel_cuda_image.h new file mode 100644 index 00000000000..b7be4fe4409 --- /dev/null +++ 
b/intern/cycles/kernel/kernels/cuda/kernel_cuda_image.h @@ -0,0 +1,310 @@ +/* + * Copyright 2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#if __CUDA_ARCH__ >= 300 + +/* Kepler */ + +/* w0, w1, w2, and w3 are the four cubic B-spline basis functions. */ +ccl_device float cubic_w0(float a) +{ + return (1.0f/6.0f)*(a*(a*(-a + 3.0f) - 3.0f) + 1.0f); +} + +ccl_device float cubic_w1(float a) +{ + return (1.0f/6.0f)*(a*a*(3.0f*a - 6.0f) + 4.0f); +} + +ccl_device float cubic_w2(float a) +{ + return (1.0f/6.0f)*(a*(a*(-3.0f*a + 3.0f) + 3.0f) + 1.0f); +} + +ccl_device float cubic_w3(float a) +{ + return (1.0f/6.0f)*(a*a*a); +} + +/* g0 and g1 are the two amplitude functions. */ +ccl_device float cubic_g0(float a) +{ + return cubic_w0(a) + cubic_w1(a); +} + +ccl_device float cubic_g1(float a) +{ + return cubic_w2(a) + cubic_w3(a); +} + +/* h0 and h1 are the two offset functions */ +ccl_device float cubic_h0(float a) +{ + /* Note +0.5 offset to compensate for CUDA linear filtering convention. */ + return -1.0f + cubic_w1(a) / (cubic_w0(a) + cubic_w1(a)) + 0.5f; +} + +ccl_device float cubic_h1(float a) +{ + return 1.0f + cubic_w3(a) / (cubic_w2(a) + cubic_w3(a)) + 0.5f; +} + +/* Fast bicubic texture lookup using 4 bilinear lookups, adapted from CUDA samples. 
*/ +template<typename T> +ccl_device T kernel_tex_image_interp_bicubic(const TextureInfo& info, CUtexObject tex, float x, float y) +{ + x = (x * info.width) - 0.5f; + y = (y * info.height) - 0.5f; + + float px = floor(x); + float py = floor(y); + float fx = x - px; + float fy = y - py; + + float g0x = cubic_g0(fx); + float g1x = cubic_g1(fx); + float x0 = (px + cubic_h0(fx)) / info.width; + float x1 = (px + cubic_h1(fx)) / info.width; + float y0 = (py + cubic_h0(fy)) / info.height; + float y1 = (py + cubic_h1(fy)) / info.height; + + return cubic_g0(fy) * (g0x * tex2D<T>(tex, x0, y0) + + g1x * tex2D<T>(tex, x1, y0)) + + cubic_g1(fy) * (g0x * tex2D<T>(tex, x0, y1) + + g1x * tex2D<T>(tex, x1, y1)); +} + +/* Fast tricubic texture lookup using 8 bilinear lookups. */ +template<typename T> +ccl_device T kernel_tex_image_interp_bicubic_3d(const TextureInfo& info, CUtexObject tex, float x, float y, float z) +{ + x = (x * info.width) - 0.5f; + y = (y * info.height) - 0.5f; + z = (z * info.depth) - 0.5f; + + float px = floor(x); + float py = floor(y); + float pz = floor(z); + float fx = x - px; + float fy = y - py; + float fz = z - pz; + + float g0x = cubic_g0(fx); + float g1x = cubic_g1(fx); + float g0y = cubic_g0(fy); + float g1y = cubic_g1(fy); + float g0z = cubic_g0(fz); + float g1z = cubic_g1(fz); + + float x0 = (px + cubic_h0(fx)) / info.width; + float x1 = (px + cubic_h1(fx)) / info.width; + float y0 = (py + cubic_h0(fy)) / info.height; + float y1 = (py + cubic_h1(fy)) / info.height; + float z0 = (pz + cubic_h0(fz)) / info.depth; + float z1 = (pz + cubic_h1(fz)) / info.depth; + + return g0z * (g0y * (g0x * tex3D<T>(tex, x0, y0, z0) + + g1x * tex3D<T>(tex, x1, y0, z0)) + + g1y * (g0x * tex3D<T>(tex, x0, y1, z0) + + g1x * tex3D<T>(tex, x1, y1, z0))) + + g1z * (g0y * (g0x * tex3D<T>(tex, x0, y0, z1) + + g1x * tex3D<T>(tex, x1, y0, z1)) + + g1y * (g0x * tex3D<T>(tex, x0, y1, z1) + + g1x * tex3D<T>(tex, x1, y1, z1))); +} + +ccl_device float4 
kernel_tex_image_interp(KernelGlobals *kg, int id, float x, float y) +{ + const TextureInfo& info = kernel_tex_fetch(__texture_info, id); + CUtexObject tex = (CUtexObject)info.data; + + /* float4, byte4 and half4 */ + const int texture_type = kernel_tex_type(id); + if(texture_type == IMAGE_DATA_TYPE_FLOAT4 || + texture_type == IMAGE_DATA_TYPE_BYTE4 || + texture_type == IMAGE_DATA_TYPE_HALF4) + { + if(info.interpolation == INTERPOLATION_CUBIC) { + return kernel_tex_image_interp_bicubic<float4>(info, tex, x, y); + } + else { + return tex2D<float4>(tex, x, y); + } + } + /* float, byte and half */ + else { + float f; + + if(info.interpolation == INTERPOLATION_CUBIC) { + f = kernel_tex_image_interp_bicubic<float>(info, tex, x, y); + } + else { + f = tex2D<float>(tex, x, y); + } + + return make_float4(f, f, f, 1.0f); + } +} + +ccl_device float4 kernel_tex_image_interp_3d(KernelGlobals *kg, int id, float x, float y, float z, InterpolationType interp) +{ + const TextureInfo& info = kernel_tex_fetch(__texture_info, id); + CUtexObject tex = (CUtexObject)info.data; + uint interpolation = (interp == INTERPOLATION_NONE)? 
info.interpolation: interp; + + const int texture_type = kernel_tex_type(id); + if(texture_type == IMAGE_DATA_TYPE_FLOAT4 || + texture_type == IMAGE_DATA_TYPE_BYTE4 || + texture_type == IMAGE_DATA_TYPE_HALF4) + { + if(interpolation == INTERPOLATION_CUBIC) { + return kernel_tex_image_interp_bicubic_3d<float4>(info, tex, x, y, z); + } + else { + return tex3D<float4>(tex, x, y, z); + } + } + else { + float f; + + if(interpolation == INTERPOLATION_CUBIC) { + f = kernel_tex_image_interp_bicubic_3d<float>(info, tex, x, y, z); + } + else { + f = tex3D<float>(tex, x, y, z); + } + + return make_float4(f, f, f, 1.0f); + } +} + +#else + +/* Fermi */ + +ccl_device float4 kernel_tex_image_interp(KernelGlobals *kg, int id, float x, float y) +{ + float4 r; + switch(id) { + case 0: r = tex2D(__tex_image_float4_000, x, y); break; + case 8: r = tex2D(__tex_image_float4_008, x, y); break; + case 16: r = tex2D(__tex_image_float4_016, x, y); break; + case 24: r = tex2D(__tex_image_float4_024, x, y); break; + case 32: r = tex2D(__tex_image_float4_032, x, y); break; + case 1: r = tex2D(__tex_image_byte4_001, x, y); break; + case 9: r = tex2D(__tex_image_byte4_009, x, y); break; + case 17: r = tex2D(__tex_image_byte4_017, x, y); break; + case 25: r = tex2D(__tex_image_byte4_025, x, y); break; + case 33: r = tex2D(__tex_image_byte4_033, x, y); break; + case 41: r = tex2D(__tex_image_byte4_041, x, y); break; + case 49: r = tex2D(__tex_image_byte4_049, x, y); break; + case 57: r = tex2D(__tex_image_byte4_057, x, y); break; + case 65: r = tex2D(__tex_image_byte4_065, x, y); break; + case 73: r = tex2D(__tex_image_byte4_073, x, y); break; + case 81: r = tex2D(__tex_image_byte4_081, x, y); break; + case 89: r = tex2D(__tex_image_byte4_089, x, y); break; + case 97: r = tex2D(__tex_image_byte4_097, x, y); break; + case 105: r = tex2D(__tex_image_byte4_105, x, y); break; + case 113: r = tex2D(__tex_image_byte4_113, x, y); break; + case 121: r = tex2D(__tex_image_byte4_121, x, y); break; + case 
129: r = tex2D(__tex_image_byte4_129, x, y); break; + case 137: r = tex2D(__tex_image_byte4_137, x, y); break; + case 145: r = tex2D(__tex_image_byte4_145, x, y); break; + case 153: r = tex2D(__tex_image_byte4_153, x, y); break; + case 161: r = tex2D(__tex_image_byte4_161, x, y); break; + case 169: r = tex2D(__tex_image_byte4_169, x, y); break; + case 177: r = tex2D(__tex_image_byte4_177, x, y); break; + case 185: r = tex2D(__tex_image_byte4_185, x, y); break; + case 193: r = tex2D(__tex_image_byte4_193, x, y); break; + case 201: r = tex2D(__tex_image_byte4_201, x, y); break; + case 209: r = tex2D(__tex_image_byte4_209, x, y); break; + case 217: r = tex2D(__tex_image_byte4_217, x, y); break; + case 225: r = tex2D(__tex_image_byte4_225, x, y); break; + case 233: r = tex2D(__tex_image_byte4_233, x, y); break; + case 241: r = tex2D(__tex_image_byte4_241, x, y); break; + case 249: r = tex2D(__tex_image_byte4_249, x, y); break; + case 257: r = tex2D(__tex_image_byte4_257, x, y); break; + case 265: r = tex2D(__tex_image_byte4_265, x, y); break; + case 273: r = tex2D(__tex_image_byte4_273, x, y); break; + case 281: r = tex2D(__tex_image_byte4_281, x, y); break; + case 289: r = tex2D(__tex_image_byte4_289, x, y); break; + case 297: r = tex2D(__tex_image_byte4_297, x, y); break; + case 305: r = tex2D(__tex_image_byte4_305, x, y); break; + case 313: r = tex2D(__tex_image_byte4_313, x, y); break; + case 321: r = tex2D(__tex_image_byte4_321, x, y); break; + case 329: r = tex2D(__tex_image_byte4_329, x, y); break; + case 337: r = tex2D(__tex_image_byte4_337, x, y); break; + case 345: r = tex2D(__tex_image_byte4_345, x, y); break; + case 353: r = tex2D(__tex_image_byte4_353, x, y); break; + case 361: r = tex2D(__tex_image_byte4_361, x, y); break; + case 369: r = tex2D(__tex_image_byte4_369, x, y); break; + case 377: r = tex2D(__tex_image_byte4_377, x, y); break; + case 385: r = tex2D(__tex_image_byte4_385, x, y); break; + case 393: r = tex2D(__tex_image_byte4_393, x, y); break; 
+ case 401: r = tex2D(__tex_image_byte4_401, x, y); break; + case 409: r = tex2D(__tex_image_byte4_409, x, y); break; + case 417: r = tex2D(__tex_image_byte4_417, x, y); break; + case 425: r = tex2D(__tex_image_byte4_425, x, y); break; + case 433: r = tex2D(__tex_image_byte4_433, x, y); break; + case 441: r = tex2D(__tex_image_byte4_441, x, y); break; + case 449: r = tex2D(__tex_image_byte4_449, x, y); break; + case 457: r = tex2D(__tex_image_byte4_457, x, y); break; + case 465: r = tex2D(__tex_image_byte4_465, x, y); break; + case 473: r = tex2D(__tex_image_byte4_473, x, y); break; + case 481: r = tex2D(__tex_image_byte4_481, x, y); break; + case 489: r = tex2D(__tex_image_byte4_489, x, y); break; + case 497: r = tex2D(__tex_image_byte4_497, x, y); break; + case 505: r = tex2D(__tex_image_byte4_505, x, y); break; + case 513: r = tex2D(__tex_image_byte4_513, x, y); break; + case 521: r = tex2D(__tex_image_byte4_521, x, y); break; + case 529: r = tex2D(__tex_image_byte4_529, x, y); break; + case 537: r = tex2D(__tex_image_byte4_537, x, y); break; + case 545: r = tex2D(__tex_image_byte4_545, x, y); break; + case 553: r = tex2D(__tex_image_byte4_553, x, y); break; + case 561: r = tex2D(__tex_image_byte4_561, x, y); break; + case 569: r = tex2D(__tex_image_byte4_569, x, y); break; + case 577: r = tex2D(__tex_image_byte4_577, x, y); break; + case 585: r = tex2D(__tex_image_byte4_585, x, y); break; + case 593: r = tex2D(__tex_image_byte4_593, x, y); break; + case 601: r = tex2D(__tex_image_byte4_601, x, y); break; + case 609: r = tex2D(__tex_image_byte4_609, x, y); break; + case 617: r = tex2D(__tex_image_byte4_617, x, y); break; + case 625: r = tex2D(__tex_image_byte4_625, x, y); break; + case 633: r = tex2D(__tex_image_byte4_633, x, y); break; + case 641: r = tex2D(__tex_image_byte4_641, x, y); break; + case 649: r = tex2D(__tex_image_byte4_649, x, y); break; + case 657: r = tex2D(__tex_image_byte4_657, x, y); break; + case 665: r = tex2D(__tex_image_byte4_665, x, y); 
break; + default: r = make_float4(0.0f, 0.0f, 0.0f, 0.0f); + } + return r; +} + +ccl_device float4 kernel_tex_image_interp_3d(KernelGlobals *kg, int id, float x, float y, float z, InterpolationType interp) +{ + float4 r; + switch(id) { + case 0: r = tex3D(__tex_image_float4_3d_000, x, y, z); break; + case 8: r = tex3D(__tex_image_float4_3d_008, x, y, z); break; + case 16: r = tex3D(__tex_image_float4_3d_016, x, y, z); break; + case 24: r = tex3D(__tex_image_float4_3d_024, x, y, z); break; + case 32: r = tex3D(__tex_image_float4_3d_032, x, y, z); break; + } + return r; +} + +#endif + diff --git a/intern/cycles/kernel/kernels/cuda/kernel_split.cu b/intern/cycles/kernel/kernels/cuda/kernel_split.cu new file mode 100644 index 00000000000..43b3d0aa0e6 --- /dev/null +++ b/intern/cycles/kernel/kernels/cuda/kernel_split.cu @@ -0,0 +1,148 @@ +/* + * Copyright 2011-2016 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* CUDA split kernel entry points */ + +#ifdef __CUDA_ARCH__ + +#define __SPLIT_KERNEL__ + +#include "kernel/kernel_compat_cuda.h" +#include "kernel_config.h" + +#include "kernel/split/kernel_split_common.h" +#include "kernel/split/kernel_data_init.h" +#include "kernel/split/kernel_path_init.h" +#include "kernel/split/kernel_scene_intersect.h" +#include "kernel/split/kernel_lamp_emission.h" +#include "kernel/split/kernel_do_volume.h" +#include "kernel/split/kernel_queue_enqueue.h" +#include "kernel/split/kernel_indirect_background.h" +#include "kernel/split/kernel_shader_setup.h" +#include "kernel/split/kernel_shader_sort.h" +#include "kernel/split/kernel_shader_eval.h" +#include "kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h" +#include "kernel/split/kernel_subsurface_scatter.h" +#include "kernel/split/kernel_direct_lighting.h" +#include "kernel/split/kernel_shadow_blocked_ao.h" +#include "kernel/split/kernel_shadow_blocked_dl.h" +#include "kernel/split/kernel_enqueue_inactive.h" +#include "kernel/split/kernel_next_iteration_setup.h" +#include "kernel/split/kernel_indirect_subsurface.h" +#include "kernel/split/kernel_buffer_update.h" + +#include "kernel/kernel_film.h" + +/* kernels */ +extern "C" __global__ void +CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) +kernel_cuda_state_buffer_size(uint num_threads, uint64_t *size) +{ + *size = split_data_buffer_size(NULL, num_threads); +} + +extern "C" __global__ void +CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) +kernel_cuda_path_trace_data_init( + ccl_global void *split_data_buffer, + int num_elements, + ccl_global char *ray_state, + int start_sample, + int end_sample, + int sx, int sy, int sw, int sh, int offset, int stride, + ccl_global int *Queue_index, + int queuesize, + ccl_global char *use_queues_flag, + ccl_global unsigned int *work_pool_wgs, + unsigned int num_samples, + ccl_global float *buffer) +{ + kernel_data_init(NULL, + NULL, + 
split_data_buffer, + num_elements, + ray_state, + start_sample, + end_sample, + sx, sy, sw, sh, offset, stride, + Queue_index, + queuesize, + use_queues_flag, + work_pool_wgs, + num_samples, + buffer); +} + +#define DEFINE_SPLIT_KERNEL_FUNCTION(name) \ + extern "C" __global__ void \ + CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_SPLIT_MAX_REGISTERS) \ + kernel_cuda_##name() \ + { \ + kernel_##name(NULL); \ + } + +#define DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(name, type) \ + extern "C" __global__ void \ + CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_SPLIT_MAX_REGISTERS) \ + kernel_cuda_##name() \ + { \ + ccl_local type locals; \ + kernel_##name(NULL, &locals); \ + } + +DEFINE_SPLIT_KERNEL_FUNCTION(path_init) +DEFINE_SPLIT_KERNEL_FUNCTION(scene_intersect) +DEFINE_SPLIT_KERNEL_FUNCTION(lamp_emission) +DEFINE_SPLIT_KERNEL_FUNCTION(do_volume) +DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(queue_enqueue, QueueEnqueueLocals) +DEFINE_SPLIT_KERNEL_FUNCTION(indirect_background) +DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(shader_setup, uint) +DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(shader_sort, ShaderSortLocals) +DEFINE_SPLIT_KERNEL_FUNCTION(shader_eval) +DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(holdout_emission_blurring_pathtermination_ao, BackgroundAOLocals) +DEFINE_SPLIT_KERNEL_FUNCTION(subsurface_scatter) +DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(direct_lighting, uint) +DEFINE_SPLIT_KERNEL_FUNCTION(shadow_blocked_ao) +DEFINE_SPLIT_KERNEL_FUNCTION(shadow_blocked_dl) +DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(enqueue_inactive, uint) +DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(next_iteration_setup, uint) +DEFINE_SPLIT_KERNEL_FUNCTION(indirect_subsurface) +DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(buffer_update, uint) + +extern "C" __global__ void +CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) +kernel_cuda_convert_to_byte(uchar4 *rgba, float *buffer, float sample_scale, int sx, int sy, int sw, int sh, int offset, int stride) +{ + int x = sx + blockDim.x*blockIdx.x + 
threadIdx.x; + int y = sy + blockDim.y*blockIdx.y + threadIdx.y; + + if(x < sx + sw && y < sy + sh) + kernel_film_convert_to_byte(NULL, rgba, buffer, sample_scale, x, y, offset, stride); +} + +extern "C" __global__ void +CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) +kernel_cuda_convert_to_half_float(uchar4 *rgba, float *buffer, float sample_scale, int sx, int sy, int sw, int sh, int offset, int stride) +{ + int x = sx + blockDim.x*blockIdx.x + threadIdx.x; + int y = sy + blockDim.y*blockIdx.y + threadIdx.y; + + if(x < sx + sw && y < sy + sh) + kernel_film_convert_to_half_float(NULL, rgba, buffer, sample_scale, x, y, offset, stride); +} + +#endif + diff --git a/intern/cycles/kernel/kernels/opencl/filter.cl b/intern/cycles/kernel/kernels/opencl/filter.cl new file mode 100644 index 00000000000..7a7b596a350 --- /dev/null +++ b/intern/cycles/kernel/kernels/opencl/filter.cl @@ -0,0 +1,276 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* OpenCL kernel entry points */ + +#include "kernel/kernel_compat_opencl.h" + +#include "kernel/filter/filter_kernel.h" + +/* kernels */ + +__kernel void kernel_ocl_filter_divide_shadow(int sample, + ccl_global TilesInfo *tiles, + ccl_global float *unfilteredA, + ccl_global float *unfilteredB, + ccl_global float *sampleVariance, + ccl_global float *sampleVarianceV, + ccl_global float *bufferVariance, + int4 prefilter_rect, + int buffer_pass_stride, + int buffer_denoising_offset) +{ + int x = prefilter_rect.x + get_global_id(0); + int y = prefilter_rect.y + get_global_id(1); + if(x < prefilter_rect.z && y < prefilter_rect.w) { + kernel_filter_divide_shadow(sample, + tiles, + x, y, + unfilteredA, + unfilteredB, + sampleVariance, + sampleVarianceV, + bufferVariance, + prefilter_rect, + buffer_pass_stride, + buffer_denoising_offset); + } +} + +__kernel void kernel_ocl_filter_get_feature(int sample, + ccl_global TilesInfo *tiles, + int m_offset, + int v_offset, + ccl_global float *mean, + ccl_global float *variance, + int4 prefilter_rect, + int buffer_pass_stride, + int buffer_denoising_offset) +{ + int x = prefilter_rect.x + get_global_id(0); + int y = prefilter_rect.y + get_global_id(1); + if(x < prefilter_rect.z && y < prefilter_rect.w) { + kernel_filter_get_feature(sample, + tiles, + m_offset, v_offset, + x, y, + mean, variance, + prefilter_rect, + buffer_pass_stride, + buffer_denoising_offset); + } +} + +__kernel void kernel_ocl_filter_detect_outliers(ccl_global float *image, + ccl_global float *variance, + ccl_global float *depth, + ccl_global float *output, + int4 prefilter_rect, + int pass_stride) +{ + int x = prefilter_rect.x + get_global_id(0); + int y = prefilter_rect.y + get_global_id(1); + if(x < prefilter_rect.z && y < prefilter_rect.w) { + kernel_filter_detect_outliers(x, y, image, variance, depth, output, prefilter_rect, pass_stride); + } +} + +__kernel void kernel_ocl_filter_combine_halves(ccl_global float *mean, + ccl_global float *variance, + 
ccl_global float *a, + ccl_global float *b, + int4 prefilter_rect, + int r) +{ + int x = prefilter_rect.x + get_global_id(0); + int y = prefilter_rect.y + get_global_id(1); + if(x < prefilter_rect.z && y < prefilter_rect.w) { + kernel_filter_combine_halves(x, y, mean, variance, a, b, prefilter_rect, r); + } +} + +__kernel void kernel_ocl_filter_construct_transform(const ccl_global float *ccl_restrict buffer, + ccl_global float *transform, + ccl_global int *rank, + int4 filter_area, + int4 rect, + int pass_stride, + int radius, + float pca_threshold) +{ + int x = get_global_id(0); + int y = get_global_id(1); + if(x < filter_area.z && y < filter_area.w) { + ccl_global int *l_rank = rank + y*filter_area.z + x; + ccl_global float *l_transform = transform + y*filter_area.z + x; + kernel_filter_construct_transform(buffer, + x + filter_area.x, y + filter_area.y, + rect, pass_stride, + l_transform, l_rank, + radius, pca_threshold, + filter_area.z*filter_area.w, + get_local_id(1)*get_local_size(0) + get_local_id(0)); + } +} + +__kernel void kernel_ocl_filter_nlm_calc_difference(int dx, + int dy, + const ccl_global float *ccl_restrict weight_image, + const ccl_global float *ccl_restrict variance_image, + ccl_global float *difference_image, + int4 rect, + int w, + int channel_offset, + float a, + float k_2) +{ + int x = get_global_id(0) + rect.x; + int y = get_global_id(1) + rect.y; + if(x < rect.z && y < rect.w) { + kernel_filter_nlm_calc_difference(x, y, dx, dy, weight_image, variance_image, difference_image, rect, w, channel_offset, a, k_2); + } +} + +__kernel void kernel_ocl_filter_nlm_blur(const ccl_global float *ccl_restrict difference_image, + ccl_global float *out_image, + int4 rect, + int w, + int f) +{ + int x = get_global_id(0) + rect.x; + int y = get_global_id(1) + rect.y; + if(x < rect.z && y < rect.w) { + kernel_filter_nlm_blur(x, y, difference_image, out_image, rect, w, f); + } +} + +__kernel void kernel_ocl_filter_nlm_calc_weight(const ccl_global float 
*ccl_restrict difference_image, + ccl_global float *out_image, + int4 rect, + int w, + int f) +{ + int x = get_global_id(0) + rect.x; + int y = get_global_id(1) + rect.y; + if(x < rect.z && y < rect.w) { + kernel_filter_nlm_calc_weight(x, y, difference_image, out_image, rect, w, f); + } +} + +__kernel void kernel_ocl_filter_nlm_update_output(int dx, + int dy, + const ccl_global float *ccl_restrict difference_image, + const ccl_global float *ccl_restrict image, + ccl_global float *out_image, + ccl_global float *accum_image, + int4 rect, + int w, + int f) +{ + int x = get_global_id(0) + rect.x; + int y = get_global_id(1) + rect.y; + if(x < rect.z && y < rect.w) { + kernel_filter_nlm_update_output(x, y, dx, dy, difference_image, image, out_image, accum_image, rect, w, f); + } +} + +__kernel void kernel_ocl_filter_nlm_normalize(ccl_global float *out_image, + const ccl_global float *ccl_restrict accum_image, + int4 rect, + int w) +{ + int x = get_global_id(0) + rect.x; + int y = get_global_id(1) + rect.y; + if(x < rect.z && y < rect.w) { + kernel_filter_nlm_normalize(x, y, out_image, accum_image, rect, w); + } +} + +__kernel void kernel_ocl_filter_nlm_construct_gramian(int dx, + int dy, + const ccl_global float *ccl_restrict difference_image, + const ccl_global float *ccl_restrict buffer, + const ccl_global float *ccl_restrict transform, + ccl_global int *rank, + ccl_global float *XtWX, + ccl_global float3 *XtWY, + int4 rect, + int4 filter_rect, + int w, + int h, + int f, + int pass_stride) +{ + int x = get_global_id(0) + max(0, rect.x-filter_rect.x); + int y = get_global_id(1) + max(0, rect.y-filter_rect.y); + if(x < min(filter_rect.z, rect.z-filter_rect.x) && y < min(filter_rect.w, rect.w-filter_rect.y)) { + kernel_filter_nlm_construct_gramian(x, y, + dx, dy, + difference_image, + buffer, + transform, rank, + XtWX, XtWY, + rect, filter_rect, + w, h, f, + pass_stride, + get_local_id(1)*get_local_size(0) + get_local_id(0)); + } +} + +__kernel void 
kernel_ocl_filter_finalize(int w, + int h, + ccl_global float *buffer, + ccl_global int *rank, + ccl_global float *XtWX, + ccl_global float3 *XtWY, + int4 filter_area, + int4 buffer_params, + int sample) +{ + int x = get_global_id(0); + int y = get_global_id(1); + if(x < filter_area.z && y < filter_area.w) { + int storage_ofs = y*filter_area.z+x; + rank += storage_ofs; + XtWX += storage_ofs; + XtWY += storage_ofs; + kernel_filter_finalize(x, y, w, h, buffer, rank, filter_area.z*filter_area.w, XtWX, XtWY, buffer_params, sample); + } +} + +__kernel void kernel_ocl_filter_set_tiles(ccl_global TilesInfo* tiles, + ccl_global float *buffer_1, + ccl_global float *buffer_2, + ccl_global float *buffer_3, + ccl_global float *buffer_4, + ccl_global float *buffer_5, + ccl_global float *buffer_6, + ccl_global float *buffer_7, + ccl_global float *buffer_8, + ccl_global float *buffer_9) +{ + if((get_global_id(0) == 0) && (get_global_id(1) == 0)) { + tiles->buffers[0] = buffer_1; + tiles->buffers[1] = buffer_2; + tiles->buffers[2] = buffer_3; + tiles->buffers[3] = buffer_4; + tiles->buffers[4] = buffer_5; + tiles->buffers[5] = buffer_6; + tiles->buffers[6] = buffer_7; + tiles->buffers[7] = buffer_8; + tiles->buffers[8] = buffer_9; + } +} diff --git a/intern/cycles/kernel/kernels/opencl/kernel.cl b/intern/cycles/kernel/kernels/opencl/kernel.cl index a68f97857b6..9d5d784e140 100644 --- a/intern/cycles/kernel/kernels/opencl/kernel.cl +++ b/intern/cycles/kernel/kernels/opencl/kernel.cl @@ -16,45 +16,42 @@ /* OpenCL kernel entry points - unfinished */ -#include "../../kernel_compat_opencl.h" -#include "../../kernel_math.h" -#include "../../kernel_types.h" -#include "../../kernel_globals.h" -#include "../../kernel_image_opencl.h" +#include "kernel/kernel_compat_opencl.h" +#include "kernel/kernel_math.h" +#include "kernel/kernel_types.h" +#include "kernel/kernel_globals.h" +#include "kernel/kernels/opencl/kernel_opencl_image.h" -#include "../../kernel_film.h" +#include 
"kernel/kernel_film.h" #if defined(__COMPILE_ONLY_MEGAKERNEL__) || !defined(__NO_BAKING__) -# include "../../kernel_path.h" -# include "../../kernel_path_branched.h" +# include "kernel/kernel_path.h" +# include "kernel/kernel_path_branched.h" #else /* __COMPILE_ONLY_MEGAKERNEL__ */ /* Include only actually used headers for the case * when path tracing kernels are not needed. */ -# include "../../kernel_random.h" -# include "../../kernel_differential.h" -# include "../../kernel_montecarlo.h" -# include "../../kernel_projection.h" -# include "../../geom/geom.h" -# include "../../bvh/bvh.h" - -# include "../../kernel_accumulate.h" -# include "../../kernel_camera.h" -# include "../../kernel_shader.h" +# include "kernel/kernel_random.h" +# include "kernel/kernel_differential.h" +# include "kernel/kernel_montecarlo.h" +# include "kernel/kernel_projection.h" +# include "kernel/geom/geom.h" +# include "kernel/bvh/bvh.h" + +# include "kernel/kernel_accumulate.h" +# include "kernel/kernel_camera.h" +# include "kernel/kernel_shader.h" #endif /* defined(__COMPILE_ONLY_MEGAKERNEL__) || !defined(__NO_BAKING__) */ -#include "../../kernel_bake.h" +#include "kernel/kernel_bake.h" #ifdef __COMPILE_ONLY_MEGAKERNEL__ __kernel void kernel_ocl_path_trace( ccl_constant KernelData *data, ccl_global float *buffer, - ccl_global uint *rng_state, -#define KERNEL_TEX(type, ttype, name) \ - ccl_global type *name, -#include "../../kernel_textures.h" + KERNEL_BUFFER_PARAMS, int sample, int sx, int sy, int sw, int sh, int offset, int stride) @@ -63,28 +60,24 @@ __kernel void kernel_ocl_path_trace( kg->data = data; -#define KERNEL_TEX(type, ttype, name) \ - kg->name = name; -#include "../../kernel_textures.h" + kernel_set_buffer_pointers(kg, KERNEL_BUFFER_ARGS); + kernel_set_buffer_info(kg); - int x = sx + get_global_id(0); - int y = sy + get_global_id(1); + int x = sx + ccl_global_id(0); + int y = sy + ccl_global_id(1); if(x < sx + sw && y < sy + sh) - kernel_path_trace(kg, buffer, rng_state, 
sample, x, y, offset, stride); + kernel_path_trace(kg, buffer, sample, x, y, offset, stride); } #else /* __COMPILE_ONLY_MEGAKERNEL__ */ -__kernel void kernel_ocl_shader( +__kernel void kernel_ocl_displace( ccl_constant KernelData *data, ccl_global uint4 *input, ccl_global float4 *output, - ccl_global float *output_luma, -#define KERNEL_TEX(type, ttype, name) \ - ccl_global type *name, -#include "../../kernel_textures.h" + KERNEL_BUFFER_PARAMS, int type, int sx, int sw, int offset, int sample) { @@ -92,20 +85,35 @@ __kernel void kernel_ocl_shader( kg->data = data; -#define KERNEL_TEX(type, ttype, name) \ - kg->name = name; -#include "../../kernel_textures.h" + kernel_set_buffer_pointers(kg, KERNEL_BUFFER_ARGS); + kernel_set_buffer_info(kg); - int x = sx + get_global_id(0); + int x = sx + ccl_global_id(0); if(x < sx + sw) { - kernel_shader_evaluate(kg, - input, - output, - output_luma, - (ShaderEvalType)type, - x, - sample); + kernel_displace_evaluate(kg, input, output, x); + } +} +__kernel void kernel_ocl_background( + ccl_constant KernelData *data, + ccl_global uint4 *input, + ccl_global float4 *output, + + KERNEL_BUFFER_PARAMS, + + int type, int sx, int sw, int offset, int sample) +{ + KernelGlobals kglobals, *kg = &kglobals; + + kg->data = data; + + kernel_set_buffer_pointers(kg, KERNEL_BUFFER_ARGS); + kernel_set_buffer_info(kg); + + int x = sx + ccl_global_id(0); + + if(x < sx + sw) { + kernel_background_evaluate(kg, input, output, x); } } @@ -114,9 +122,7 @@ __kernel void kernel_ocl_bake( ccl_global uint4 *input, ccl_global float4 *output, -#define KERNEL_TEX(type, ttype, name) \ - ccl_global type *name, -#include "../../kernel_textures.h" + KERNEL_BUFFER_PARAMS, int type, int filter, int sx, int sw, int offset, int sample) { @@ -124,11 +130,10 @@ __kernel void kernel_ocl_bake( kg->data = data; -#define KERNEL_TEX(type, ttype, name) \ - kg->name = name; -#include "../../kernel_textures.h" + kernel_set_buffer_pointers(kg, KERNEL_BUFFER_ARGS); + 
kernel_set_buffer_info(kg); - int x = sx + get_global_id(0); + int x = sx + ccl_global_id(0); if(x < sx + sw) { #ifdef __NO_BAKING__ @@ -144,9 +149,7 @@ __kernel void kernel_ocl_convert_to_byte( ccl_global uchar4 *rgba, ccl_global float *buffer, -#define KERNEL_TEX(type, ttype, name) \ - ccl_global type *name, -#include "../../kernel_textures.h" + KERNEL_BUFFER_PARAMS, float sample_scale, int sx, int sy, int sw, int sh, int offset, int stride) @@ -155,12 +158,11 @@ __kernel void kernel_ocl_convert_to_byte( kg->data = data; -#define KERNEL_TEX(type, ttype, name) \ - kg->name = name; -#include "../../kernel_textures.h" + kernel_set_buffer_pointers(kg, KERNEL_BUFFER_ARGS); + kernel_set_buffer_info(kg); - int x = sx + get_global_id(0); - int y = sy + get_global_id(1); + int x = sx + ccl_global_id(0); + int y = sy + ccl_global_id(1); if(x < sx + sw && y < sy + sh) kernel_film_convert_to_byte(kg, rgba, buffer, sample_scale, x, y, offset, stride); @@ -171,9 +173,7 @@ __kernel void kernel_ocl_convert_to_half_float( ccl_global uchar4 *rgba, ccl_global float *buffer, -#define KERNEL_TEX(type, ttype, name) \ - ccl_global type *name, -#include "../../kernel_textures.h" + KERNEL_BUFFER_PARAMS, float sample_scale, int sx, int sy, int sw, int sh, int offset, int stride) @@ -182,15 +182,30 @@ __kernel void kernel_ocl_convert_to_half_float( kg->data = data; -#define KERNEL_TEX(type, ttype, name) \ - kg->name = name; -#include "../../kernel_textures.h" + kernel_set_buffer_pointers(kg, KERNEL_BUFFER_ARGS); + kernel_set_buffer_info(kg); - int x = sx + get_global_id(0); - int y = sy + get_global_id(1); + int x = sx + ccl_global_id(0); + int y = sy + ccl_global_id(1); if(x < sx + sw && y < sy + sh) kernel_film_convert_to_half_float(kg, rgba, buffer, sample_scale, x, y, offset, stride); } +__kernel void kernel_ocl_zero_buffer(ccl_global float4 *buffer, uint64_t size, uint64_t offset) +{ + size_t i = ccl_global_id(0) + ccl_global_id(1) * ccl_global_size(0); + + if(i < size / 
sizeof(float4)) { + buffer[i+offset/sizeof(float4)] = make_float4(0.0f, 0.0f, 0.0f, 0.0f); + } + else if(i == size / sizeof(float4)) { + ccl_global uchar *b = (ccl_global uchar*)&buffer[i+offset/sizeof(float4)]; + + for(i = 0; i < size % sizeof(float4); i++) { + *(b++) = 0; + } + } +} + #endif /* __COMPILE_ONLY_MEGAKERNEL__ */ diff --git a/intern/cycles/kernel/kernels/opencl/kernel_background_buffer_update.cl b/intern/cycles/kernel/kernels/opencl/kernel_background_buffer_update.cl deleted file mode 100644 index 1914d241eb1..00000000000 --- a/intern/cycles/kernel/kernels/opencl/kernel_background_buffer_update.cl +++ /dev/null @@ -1,125 +0,0 @@ -/* - * Copyright 2011-2015 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "split/kernel_background_buffer_update.h" - -__kernel void kernel_ocl_path_trace_background_buffer_update( - ccl_global char *kg, - ccl_constant KernelData *data, - ccl_global float *per_sample_output_buffers, - ccl_global uint *rng_state, - ccl_global uint *rng_coop, /* Required for buffer Update */ - ccl_global float3 *throughput_coop, /* Required for background hit processing */ - PathRadiance *PathRadiance_coop, /* Required for background hit processing and buffer Update */ - ccl_global Ray *Ray_coop, /* Required for background hit processing */ - ccl_global PathState *PathState_coop, /* Required for background hit processing */ - ccl_global float *L_transparent_coop, /* Required for background hit processing and buffer Update */ - ccl_global char *ray_state, /* Stores information on the current state of a ray */ - int sw, int sh, int sx, int sy, int stride, - int rng_state_offset_x, - int rng_state_offset_y, - int rng_state_stride, - ccl_global unsigned int *work_array, /* Denotes work of each ray */ - ccl_global int *Queue_data, /* Queues memory */ - ccl_global int *Queue_index, /* Tracks the number of elements in each queue */ - int queuesize, /* Size (capacity) of each queue */ - int end_sample, - int start_sample, -#ifdef __WORK_STEALING__ - ccl_global unsigned int *work_pool_wgs, - unsigned int num_samples, -#endif -#ifdef __KERNEL_DEBUG__ - DebugData *debugdata_coop, -#endif - int parallel_samples) /* Number of samples to be processed in parallel */ -{ - ccl_local unsigned int local_queue_atomics; - if(get_local_id(0) == 0 && get_local_id(1) == 0) { - local_queue_atomics = 0; - } - barrier(CLK_LOCAL_MEM_FENCE); - - int ray_index = get_global_id(1) * get_global_size(0) + get_global_id(0); - if(ray_index == 0) { - /* We will empty this queue in this kernel. 
*/ - Queue_index[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS] = 0; - } - char enqueue_flag = 0; - ray_index = get_ray_index(ray_index, - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS, - Queue_data, - queuesize, - 1); - -#ifdef __COMPUTE_DEVICE_GPU__ - /* If we are executing on a GPU device, we exit all threads that are not - * required. - * - * If we are executing on a CPU device, then we need to keep all threads - * active since we have barrier() calls later in the kernel. CPU devices, - * expect all threads to execute barrier statement. - */ - if(ray_index == QUEUE_EMPTY_SLOT) { - return; - } -#endif - -#ifndef __COMPUTE_DEVICE_GPU__ - if(ray_index != QUEUE_EMPTY_SLOT) { -#endif - enqueue_flag = - kernel_background_buffer_update((KernelGlobals *)kg, - per_sample_output_buffers, - rng_state, - rng_coop, - throughput_coop, - PathRadiance_coop, - Ray_coop, - PathState_coop, - L_transparent_coop, - ray_state, - sw, sh, sx, sy, stride, - rng_state_offset_x, - rng_state_offset_y, - rng_state_stride, - work_array, - end_sample, - start_sample, -#ifdef __WORK_STEALING__ - work_pool_wgs, - num_samples, -#endif -#ifdef __KERNEL_DEBUG__ - debugdata_coop, -#endif - parallel_samples, - ray_index); -#ifndef __COMPUTE_DEVICE_GPU__ - } -#endif - - /* Enqueue RAY_REGENERATED rays into QUEUE_ACTIVE_AND_REGENERATED_RAYS; - * These rays will be made active during next SceneIntersectkernel. 
- */ - enqueue_ray_index_local(ray_index, - QUEUE_ACTIVE_AND_REGENERATED_RAYS, - enqueue_flag, - queuesize, - &local_queue_atomics, - Queue_data, - Queue_index); -} diff --git a/intern/cycles/kernel/kernels/opencl/kernel_buffer_update.cl b/intern/cycles/kernel/kernels/opencl/kernel_buffer_update.cl new file mode 100644 index 00000000000..dcea2630aef --- /dev/null +++ b/intern/cycles/kernel/kernels/opencl/kernel_buffer_update.cl @@ -0,0 +1,26 @@ +/* + * Copyright 2011-2015 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "kernel/kernel_compat_opencl.h" +#include "kernel/split/kernel_split_common.h" +#include "kernel/split/kernel_buffer_update.h" + +#define KERNEL_NAME buffer_update +#define LOCALS_TYPE unsigned int +#include "kernel/kernels/opencl/kernel_split_function.h" +#undef KERNEL_NAME +#undef LOCALS_TYPE + diff --git a/intern/cycles/kernel/kernels/opencl/kernel_data_init.cl b/intern/cycles/kernel/kernels/opencl/kernel_data_init.cl index 18139687eab..7125348a49f 100644 --- a/intern/cycles/kernel/kernels/opencl/kernel_data_init.cl +++ b/intern/cycles/kernel/kernels/opencl/kernel_data_init.cl @@ -14,77 +14,40 @@ * limitations under the License. 
*/ -#include "split/kernel_data_init.h" +#include "kernel/kernel_compat_opencl.h" +#include "kernel/split/kernel_split_common.h" +#include "kernel/split/kernel_data_init.h" __kernel void kernel_ocl_path_trace_data_init( - ccl_global char *globals, - ccl_global char *sd_DL_shadow, + ccl_global char *kg, ccl_constant KernelData *data, - ccl_global float *per_sample_output_buffers, - ccl_global uint *rng_state, - ccl_global uint *rng_coop, /* rng array to store rng values for all rays */ - ccl_global float3 *throughput_coop, /* throughput array to store throughput values for all rays */ - ccl_global float *L_transparent_coop, /* L_transparent array to store L_transparent values for all rays */ - PathRadiance *PathRadiance_coop, /* PathRadiance array to store PathRadiance values for all rays */ - ccl_global Ray *Ray_coop, /* Ray array to store Ray information for all rays */ - ccl_global PathState *PathState_coop, /* PathState array to store PathState information for all rays */ - Intersection *Intersection_coop_shadow, - ccl_global char *ray_state, /* Stores information on current state of a ray */ - -#define KERNEL_TEX(type, ttype, name) \ - ccl_global type *name, -#include "../../kernel_textures.h" - - int start_sample, int sx, int sy, int sw, int sh, int offset, int stride, - int rng_state_offset_x, - int rng_state_offset_y, - int rng_state_stride, - ccl_global int *Queue_data, /* Memory for queues */ + ccl_global void *split_data_buffer, + int num_elements, + ccl_global char *ray_state, + KERNEL_BUFFER_PARAMS, + int start_sample, + int end_sample, + int sx, int sy, int sw, int sh, int offset, int stride, ccl_global int *Queue_index, /* Tracks the number of elements in queues */ int queuesize, /* size (capacity) of the queue */ ccl_global char *use_queues_flag, /* flag to decide if scene-intersect kernel should use queues to fetch ray index */ - ccl_global unsigned int *work_array, /* work array to store which work each ray belongs to */ -#ifdef __WORK_STEALING__ 
ccl_global unsigned int *work_pool_wgs, /* Work pool for each work group */ unsigned int num_samples, /* Total number of samples per pixel */ -#endif -#ifdef __KERNEL_DEBUG__ - DebugData *debugdata_coop, -#endif - int parallel_samples) /* Number of samples to be processed in parallel */ + ccl_global float *buffer) { - kernel_data_init((KernelGlobals *)globals, - (ShaderData *)sd_DL_shadow, + kernel_data_init((KernelGlobals*)kg, data, - per_sample_output_buffers, - rng_state, - rng_coop, - throughput_coop, - L_transparent_coop, - PathRadiance_coop, - Ray_coop, - PathState_coop, - Intersection_coop_shadow, + split_data_buffer, + num_elements, ray_state, - -#define KERNEL_TEX(type, ttype, name) name, -#include "../../kernel_textures.h" - - start_sample, sx, sy, sw, sh, offset, stride, - rng_state_offset_x, - rng_state_offset_y, - rng_state_stride, - Queue_data, + KERNEL_BUFFER_ARGS, + start_sample, + end_sample, + sx, sy, sw, sh, offset, stride, Queue_index, queuesize, use_queues_flag, - work_array, -#ifdef __WORK_STEALING__ work_pool_wgs, num_samples, -#endif -#ifdef __KERNEL_DEBUG__ - debugdata_coop, -#endif - parallel_samples); + buffer); } diff --git a/intern/cycles/kernel/kernels/opencl/kernel_direct_lighting.cl b/intern/cycles/kernel/kernels/opencl/kernel_direct_lighting.cl index c6a2c8d050c..ed64ae01aae 100644 --- a/intern/cycles/kernel/kernels/opencl/kernel_direct_lighting.cl +++ b/intern/cycles/kernel/kernels/opencl/kernel_direct_lighting.cl @@ -14,74 +14,13 @@ * limitations under the License. 
*/ -#include "split/kernel_direct_lighting.h" +#include "kernel/kernel_compat_opencl.h" +#include "kernel/split/kernel_split_common.h" +#include "kernel/split/kernel_direct_lighting.h" -__kernel void kernel_ocl_path_trace_direct_lighting( - ccl_global char *kg, - ccl_constant KernelData *data, - ccl_global char *sd, /* Required for direct lighting */ - ccl_global uint *rng_coop, /* Required for direct lighting */ - ccl_global PathState *PathState_coop, /* Required for direct lighting */ - ccl_global int *ISLamp_coop, /* Required for direct lighting */ - ccl_global Ray *LightRay_coop, /* Required for direct lighting */ - ccl_global BsdfEval *BSDFEval_coop, /* Required for direct lighting */ - ccl_global char *ray_state, /* Denotes the state of each ray */ - ccl_global int *Queue_data, /* Queue memory */ - ccl_global int *Queue_index, /* Tracks the number of elements in each queue */ - int queuesize) /* Size (capacity) of each queue */ -{ - ccl_local unsigned int local_queue_atomics; - if(get_local_id(0) == 0 && get_local_id(1) == 0) { - local_queue_atomics = 0; - } - barrier(CLK_LOCAL_MEM_FENCE); +#define KERNEL_NAME direct_lighting +#define LOCALS_TYPE unsigned int +#include "kernel/kernels/opencl/kernel_split_function.h" +#undef KERNEL_NAME +#undef LOCALS_TYPE - char enqueue_flag = 0; - int ray_index = get_global_id(1) * get_global_size(0) + get_global_id(0); - ray_index = get_ray_index(ray_index, - QUEUE_ACTIVE_AND_REGENERATED_RAYS, - Queue_data, - queuesize, - 0); - -#ifdef __COMPUTE_DEVICE_GPU__ - /* If we are executing on a GPU device, we exit all threads that are not - * required. - * - * If we are executing on a CPU device, then we need to keep all threads - * active since we have barrier() calls later in the kernel. CPU devices, - * expect all threads to execute barrier statement. 
- */ - if(ray_index == QUEUE_EMPTY_SLOT) { - return; - } -#endif - -#ifndef __COMPUTE_DEVICE_GPU__ - if(ray_index != QUEUE_EMPTY_SLOT) { -#endif - enqueue_flag = kernel_direct_lighting((KernelGlobals *)kg, - (ShaderData *)sd, - rng_coop, - PathState_coop, - ISLamp_coop, - LightRay_coop, - BSDFEval_coop, - ray_state, - ray_index); - -#ifndef __COMPUTE_DEVICE_GPU__ - } -#endif - -#ifdef __EMISSION__ - /* Enqueue RAY_SHADOW_RAY_CAST_DL rays. */ - enqueue_ray_index_local(ray_index, - QUEUE_SHADOW_RAY_CAST_DL_RAYS, - enqueue_flag, - queuesize, - &local_queue_atomics, - Queue_data, - Queue_index); -#endif -} diff --git a/intern/cycles/kernel/kernels/opencl/kernel_do_volume.cl b/intern/cycles/kernel/kernels/opencl/kernel_do_volume.cl new file mode 100644 index 00000000000..8afaa686e28 --- /dev/null +++ b/intern/cycles/kernel/kernels/opencl/kernel_do_volume.cl @@ -0,0 +1,24 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "kernel/kernel_compat_opencl.h" +#include "kernel/split/kernel_split_common.h" +#include "kernel/split/kernel_do_volume.h" + +#define KERNEL_NAME do_volume +#include "kernel/kernels/opencl/kernel_split_function.h" +#undef KERNEL_NAME + diff --git a/intern/cycles/kernel/kernels/opencl/kernel_enqueue_inactive.cl b/intern/cycles/kernel/kernels/opencl/kernel_enqueue_inactive.cl new file mode 100644 index 00000000000..e68d4104a91 --- /dev/null +++ b/intern/cycles/kernel/kernels/opencl/kernel_enqueue_inactive.cl @@ -0,0 +1,26 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "kernel/kernel_compat_opencl.h" +#include "kernel/split/kernel_split_common.h" +#include "kernel/split/kernel_enqueue_inactive.h" + +#define KERNEL_NAME enqueue_inactive +#define LOCALS_TYPE unsigned int +#include "kernel/kernels/opencl/kernel_split_function.h" +#undef KERNEL_NAME +#undef LOCALS_TYPE + diff --git a/intern/cycles/kernel/kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl b/intern/cycles/kernel/kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl index e063614da1a..9e1e57beba6 100644 --- a/intern/cycles/kernel/kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl +++ b/intern/cycles/kernel/kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl @@ -14,110 +14,13 @@ * limitations under the License. 
*/ -#include "split/kernel_holdout_emission_blurring_pathtermination_ao.h" +#include "kernel/kernel_compat_opencl.h" +#include "kernel/split/kernel_split_common.h" +#include "kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h" -__kernel void kernel_ocl_path_trace_holdout_emission_blurring_pathtermination_ao( - ccl_global char *kg, - ccl_constant KernelData *data, - ccl_global char *sd, /* Required throughout the kernel except probabilistic path termination and AO */ - ccl_global float *per_sample_output_buffers, - ccl_global uint *rng_coop, /* Required for "kernel_write_data_passes" and AO */ - ccl_global float3 *throughput_coop, /* Required for handling holdout material and AO */ - ccl_global float *L_transparent_coop, /* Required for handling holdout material */ - PathRadiance *PathRadiance_coop, /* Required for "kernel_write_data_passes" and indirect primitive emission */ - ccl_global PathState *PathState_coop, /* Required throughout the kernel and AO */ - Intersection *Intersection_coop, /* Required for indirect primitive emission */ - ccl_global float3 *AOAlpha_coop, /* Required for AO */ - ccl_global float3 *AOBSDF_coop, /* Required for AO */ - ccl_global Ray *AOLightRay_coop, /* Required for AO */ - int sw, int sh, int sx, int sy, int stride, - ccl_global char *ray_state, /* Denotes the state of each ray */ - ccl_global unsigned int *work_array, /* Denotes the work that each ray belongs to */ - ccl_global int *Queue_data, /* Queue memory */ - ccl_global int *Queue_index, /* Tracks the number of elements in each queue */ - int queuesize, /* Size (capacity) of each queue */ -#ifdef __WORK_STEALING__ - unsigned int start_sample, -#endif - int parallel_samples) /* Number of samples to be processed in parallel */ -{ - ccl_local unsigned int local_queue_atomics_bg; - ccl_local unsigned int local_queue_atomics_ao; - if(get_local_id(0) == 0 && get_local_id(1) == 0) { - local_queue_atomics_bg = 0; - local_queue_atomics_ao = 0; - } - 
barrier(CLK_LOCAL_MEM_FENCE); +#define KERNEL_NAME holdout_emission_blurring_pathtermination_ao +#define LOCALS_TYPE BackgroundAOLocals +#include "kernel/kernels/opencl/kernel_split_function.h" +#undef KERNEL_NAME +#undef LOCALS_TYPE - char enqueue_flag = 0; - char enqueue_flag_AO_SHADOW_RAY_CAST = 0; - int ray_index = get_global_id(1) * get_global_size(0) + get_global_id(0); - ray_index = get_ray_index(ray_index, - QUEUE_ACTIVE_AND_REGENERATED_RAYS, - Queue_data, - queuesize, - 0); - -#ifdef __COMPUTE_DEVICE_GPU__ - /* If we are executing on a GPU device, we exit all threads that are not - * required. - * - * If we are executing on a CPU device, then we need to keep all threads - * active since we have barrier() calls later in the kernel. CPU devices, - * expect all threads to execute barrier statement. - */ - if(ray_index == QUEUE_EMPTY_SLOT) { - return; - } -#endif /* __COMPUTE_DEVICE_GPU__ */ - -#ifndef __COMPUTE_DEVICE_GPU__ - if(ray_index != QUEUE_EMPTY_SLOT) { -#endif - kernel_holdout_emission_blurring_pathtermination_ao( - (KernelGlobals *)kg, - (ShaderData *)sd, - per_sample_output_buffers, - rng_coop, - throughput_coop, - L_transparent_coop, - PathRadiance_coop, - PathState_coop, - Intersection_coop, - AOAlpha_coop, - AOBSDF_coop, - AOLightRay_coop, - sw, sh, sx, sy, stride, - ray_state, - work_array, -#ifdef __WORK_STEALING__ - start_sample, -#endif - parallel_samples, - ray_index, - &enqueue_flag, - &enqueue_flag_AO_SHADOW_RAY_CAST); -#ifndef __COMPUTE_DEVICE_GPU__ - } -#endif - - /* Enqueue RAY_UPDATE_BUFFER rays. */ - enqueue_ray_index_local(ray_index, - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS, - enqueue_flag, - queuesize, - &local_queue_atomics_bg, - Queue_data, - Queue_index); - -#ifdef __AO__ - /* Enqueue to-shadow-ray-cast rays. 
*/ - enqueue_ray_index_local(ray_index, - QUEUE_SHADOW_RAY_CAST_AO_RAYS, - enqueue_flag_AO_SHADOW_RAY_CAST, - queuesize, - &local_queue_atomics_ao, - Queue_data, - Queue_index); -#endif -} diff --git a/intern/cycles/kernel/kernels/opencl/kernel_indirect_background.cl b/intern/cycles/kernel/kernels/opencl/kernel_indirect_background.cl new file mode 100644 index 00000000000..192d01444ba --- /dev/null +++ b/intern/cycles/kernel/kernels/opencl/kernel_indirect_background.cl @@ -0,0 +1,24 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "kernel/kernel_compat_opencl.h" +#include "kernel/split/kernel_split_common.h" +#include "kernel/split/kernel_indirect_background.h" + +#define KERNEL_NAME indirect_background +#include "kernel/kernels/opencl/kernel_split_function.h" +#undef KERNEL_NAME + diff --git a/intern/cycles/kernel/kernels/opencl/kernel_indirect_subsurface.cl b/intern/cycles/kernel/kernels/opencl/kernel_indirect_subsurface.cl new file mode 100644 index 00000000000..84938b889e5 --- /dev/null +++ b/intern/cycles/kernel/kernels/opencl/kernel_indirect_subsurface.cl @@ -0,0 +1,24 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "kernel/kernel_compat_opencl.h" +#include "kernel/split/kernel_split_common.h" +#include "kernel/split/kernel_indirect_subsurface.h" + +#define KERNEL_NAME indirect_subsurface +#include "kernel/kernels/opencl/kernel_split_function.h" +#undef KERNEL_NAME + diff --git a/intern/cycles/kernel/kernels/opencl/kernel_lamp_emission.cl b/intern/cycles/kernel/kernels/opencl/kernel_lamp_emission.cl index 267bddc2ffc..c314dc96c33 100644 --- a/intern/cycles/kernel/kernels/opencl/kernel_lamp_emission.cl +++ b/intern/cycles/kernel/kernels/opencl/kernel_lamp_emission.cl @@ -14,67 +14,11 @@ * limitations under the License. 
*/ -#include "split/kernel_lamp_emission.h" +#include "kernel/kernel_compat_opencl.h" +#include "kernel/split/kernel_split_common.h" +#include "kernel/split/kernel_lamp_emission.h" -__kernel void kernel_ocl_path_trace_lamp_emission( - ccl_global char *kg, - ccl_constant KernelData *data, - ccl_global float3 *throughput_coop, /* Required for lamp emission */ - PathRadiance *PathRadiance_coop, /* Required for lamp emission */ - ccl_global Ray *Ray_coop, /* Required for lamp emission */ - ccl_global PathState *PathState_coop, /* Required for lamp emission */ - Intersection *Intersection_coop, /* Required for lamp emission */ - ccl_global char *ray_state, /* Denotes the state of each ray */ - int sw, int sh, - ccl_global int *Queue_data, /* Memory for queues */ - ccl_global int *Queue_index, /* Tracks the number of elements in queues */ - int queuesize, /* Size (capacity) of queues */ - ccl_global char *use_queues_flag, /* Used to decide if this kernel should use - * queues to fetch ray index - */ - int parallel_samples) /* Number of samples to be processed in parallel */ -{ - int x = get_global_id(0); - int y = get_global_id(1); +#define KERNEL_NAME lamp_emission +#include "kernel/kernels/opencl/kernel_split_function.h" +#undef KERNEL_NAME - /* We will empty this queue in this kernel. */ - if(get_global_id(0) == 0 && get_global_id(1) == 0) { - Queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS] = 0; - } - /* Fetch use_queues_flag. 
*/ - ccl_local char local_use_queues_flag; - if(get_local_id(0) == 0 && get_local_id(1) == 0) { - local_use_queues_flag = use_queues_flag[0]; - } - barrier(CLK_LOCAL_MEM_FENCE); - - int ray_index; - if(local_use_queues_flag) { - int thread_index = get_global_id(1) * get_global_size(0) + get_global_id(0); - ray_index = get_ray_index(thread_index, - QUEUE_ACTIVE_AND_REGENERATED_RAYS, - Queue_data, - queuesize, - 1); - if(ray_index == QUEUE_EMPTY_SLOT) { - return; - } - } else { - if(x < (sw * parallel_samples) && y < sh) { - ray_index = x + y * (sw * parallel_samples); - } else { - return; - } - } - - kernel_lamp_emission((KernelGlobals *)kg, - throughput_coop, - PathRadiance_coop, - Ray_coop, - PathState_coop, - Intersection_coop, - ray_state, - sw, sh, - use_queues_flag, - ray_index); -} diff --git a/intern/cycles/kernel/kernels/opencl/kernel_next_iteration_setup.cl b/intern/cycles/kernel/kernels/opencl/kernel_next_iteration_setup.cl index 6d49b6294a8..8b1332bf013 100644 --- a/intern/cycles/kernel/kernels/opencl/kernel_next_iteration_setup.cl +++ b/intern/cycles/kernel/kernels/opencl/kernel_next_iteration_setup.cl @@ -14,101 +14,13 @@ * limitations under the License. 
*/ -#include "split/kernel_next_iteration_setup.h" +#include "kernel/kernel_compat_opencl.h" +#include "kernel/split/kernel_split_common.h" +#include "kernel/split/kernel_next_iteration_setup.h" -__kernel void kernel_ocl_path_trace_next_iteration_setup( - ccl_global char *kg, - ccl_constant KernelData *data, - ccl_global char *sd, /* Required for setting up ray for next iteration */ - ccl_global uint *rng_coop, /* Required for setting up ray for next iteration */ - ccl_global float3 *throughput_coop, /* Required for setting up ray for next iteration */ - PathRadiance *PathRadiance_coop, /* Required for setting up ray for next iteration */ - ccl_global Ray *Ray_coop, /* Required for setting up ray for next iteration */ - ccl_global PathState *PathState_coop, /* Required for setting up ray for next iteration */ - ccl_global Ray *LightRay_dl_coop, /* Required for radiance update - direct lighting */ - ccl_global int *ISLamp_coop, /* Required for radiance update - direct lighting */ - ccl_global BsdfEval *BSDFEval_coop, /* Required for radiance update - direct lighting */ - ccl_global Ray *LightRay_ao_coop, /* Required for radiance update - AO */ - ccl_global float3 *AOBSDF_coop, /* Required for radiance update - AO */ - ccl_global float3 *AOAlpha_coop, /* Required for radiance update - AO */ - ccl_global char *ray_state, /* Denotes the state of each ray */ - ccl_global int *Queue_data, /* Queue memory */ - ccl_global int *Queue_index, /* Tracks the number of elements in each queue */ - int queuesize, /* Size (capacity) of each queue */ - ccl_global char *use_queues_flag) /* flag to decide if scene_intersect kernel should - * use queues to fetch ray index */ -{ - ccl_local unsigned int local_queue_atomics; - if(get_local_id(0) == 0 && get_local_id(1) == 0) { - local_queue_atomics = 0; - } - barrier(CLK_LOCAL_MEM_FENCE); +#define KERNEL_NAME next_iteration_setup +#define LOCALS_TYPE unsigned int +#include "kernel/kernels/opencl/kernel_split_function.h" +#undef 
KERNEL_NAME +#undef LOCALS_TYPE - if(get_global_id(0) == 0 && get_global_id(1) == 0) { - /* If we are here, then it means that scene-intersect kernel - * has already been executed atleast once. From the next time, - * scene-intersect kernel may operate on queues to fetch ray index - */ - use_queues_flag[0] = 1; - - /* Mark queue indices of QUEUE_SHADOW_RAY_CAST_AO_RAYS and - * QUEUE_SHADOW_RAY_CAST_DL_RAYS queues that were made empty during the - * previous kernel. - */ - Queue_index[QUEUE_SHADOW_RAY_CAST_AO_RAYS] = 0; - Queue_index[QUEUE_SHADOW_RAY_CAST_DL_RAYS] = 0; - } - - char enqueue_flag = 0; - int ray_index = get_global_id(1) * get_global_size(0) + get_global_id(0); - ray_index = get_ray_index(ray_index, - QUEUE_ACTIVE_AND_REGENERATED_RAYS, - Queue_data, - queuesize, - 0); - -#ifdef __COMPUTE_DEVICE_GPU__ - /* If we are executing on a GPU device, we exit all threads that are not - * required. - * - * If we are executing on a CPU device, then we need to keep all threads - * active since we have barrier() calls later in the kernel. CPU devices, - * expect all threads to execute barrier statement. - */ - if(ray_index == QUEUE_EMPTY_SLOT) { - return; - } -#endif - -#ifndef __COMPUTE_DEVICE_GPU__ - if(ray_index != QUEUE_EMPTY_SLOT) { -#endif - enqueue_flag = kernel_next_iteration_setup((KernelGlobals *)kg, - (ShaderData *)sd, - rng_coop, - throughput_coop, - PathRadiance_coop, - Ray_coop, - PathState_coop, - LightRay_dl_coop, - ISLamp_coop, - BSDFEval_coop, - LightRay_ao_coop, - AOBSDF_coop, - AOAlpha_coop, - ray_state, - use_queues_flag, - ray_index); -#ifndef __COMPUTE_DEVICE_GPU__ - } -#endif - - /* Enqueue RAY_UPDATE_BUFFER rays. 
*/ - enqueue_ray_index_local(ray_index, - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS, - enqueue_flag, - queuesize, - &local_queue_atomics, - Queue_data, - Queue_index); -} diff --git a/intern/cycles/kernel/kernels/opencl/kernel_opencl_image.h b/intern/cycles/kernel/kernels/opencl/kernel_opencl_image.h new file mode 100644 index 00000000000..d908af78c7a --- /dev/null +++ b/intern/cycles/kernel/kernels/opencl/kernel_opencl_image.h @@ -0,0 +1,341 @@ +/* + * Copyright 2016 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* For OpenCL we do manual lookup and interpolation. 
*/
+
+/* Return the TextureInfo record for image slot `id`.
+ *
+ * The index into kg->buffers[0] is `id` plus one for every KERNEL_TEX() entry
+ * that kernel/kernel_textures.h expands to while KERNEL_TEX is defined as
+ * `+ 1` below. */
+ccl_device_inline ccl_global TextureInfo* kernel_tex_info(KernelGlobals *kg, uint id) {
+	const uint tex_offset = id
+#define KERNEL_TEX(type, name) + 1
+#include "kernel/kernel_textures.h"
+	;
+
+	return &((ccl_global TextureInfo*)kg->buffers[0])[tex_offset];
+}
+
+/* Read element `index` of type `type` from the buffer selected by
+ * info->cl_buffer, starting info->data bytes/units into it.
+ * Expects a `kg` variable to be in scope at the point of use. */
+#define tex_fetch(type, info, index) ((ccl_global type*)(kg->buffers[info->cl_buffer] + info->data))[(index)]
+
+/* Fetch one texel of image `id` at linear element index `offset` and convert
+ * it to float4.
+ *
+ * The storage format is taken from kernel_tex_type(id):
+ * - float4: returned as is.
+ * - byte4: every channel scaled by 1/255.
+ * - float / byte: the single value (bytes scaled by 1/255) is replicated to
+ *   x, y and z, with w set to 1. */
+ccl_device_inline float4 svm_image_texture_read(KernelGlobals *kg, int id, int offset)
+{
+	const ccl_global TextureInfo *info = kernel_tex_info(kg, id);
+	const int texture_type = kernel_tex_type(id);
+
+	/* Float4 */
+	if(texture_type == IMAGE_DATA_TYPE_FLOAT4) {
+		return tex_fetch(float4, info, offset);
+	}
+	/* Byte4 */
+	else if(texture_type == IMAGE_DATA_TYPE_BYTE4) {
+		uchar4 r = tex_fetch(uchar4, info, offset);
+		float f = 1.0f/255.0f;
+		return make_float4(r.x*f, r.y*f, r.z*f, r.w*f);
+	}
+	/* Float */
+	else if(texture_type == IMAGE_DATA_TYPE_FLOAT) {
+		float f = tex_fetch(float, info, offset);
+		return make_float4(f, f, f, 1.0f);
+	}
+	/* Byte */
+	else {
+		uchar r = tex_fetch(uchar, info, offset);
+		float f = r * (1.0f/255.0f);
+		return make_float4(f, f, f, 1.0f);
+	}
+}
+
+/* Wrap texel index `x` into [0, width) for repeating textures.
+ * The remainder of a negative `x` is shifted up by `width`. */
+ccl_device_inline int svm_image_texture_wrap_periodic(int x, int width)
+{
+	x %= width;
+	if(x < 0)
+		x += width;
+	return x;
+}
+
+/* Clamp texel index `x` to the valid range [0, width-1]. */
+ccl_device_inline int svm_image_texture_wrap_clamp(int x, int width)
+{
+	return clamp(x, 0, width-1);
+}
+
+/* Split `x` into an integer texel index and the remainder.
+ *
+ * The index written to *ix is float_to_int(x), decremented by one when x is
+ * negative; the return value is x minus that index. */
+ccl_device_inline float svm_image_texture_frac(float x, int *ix)
+{
+	int i = float_to_int(x) - ((x < 0.0f)? 1: 0);
+	*ix = i;
+	return x - (float)i;
+}
+
+/* Fill the 4-element array `u` with the cubic spline weights for the
+ * fractional position `t`. Ends in `(void)0` so the use site needs a
+ * trailing semicolon. */
+#define SET_CUBIC_SPLINE_WEIGHTS(u, t) \
+	{ \
+		u[0] = (((-1.0f/6.0f)* t + 0.5f) * t - 0.5f) * t + (1.0f/6.0f); \
+		u[1] = ((      0.5f * t - 1.0f) * t       ) * t + (2.0f/3.0f); \
+		u[2] = ((     -0.5f * t + 0.5f) * t + 0.5f) * t + (1.0f/6.0f); \
+		u[3] = (1.0f / 6.0f) * t * t * t; \
+	} (void)0
+
+/* Sample 2D image `id` at coordinates (x, y), scaled by the image width and
+ * height to obtain texel positions.
+ *
+ * Interpolation (closest, linear, otherwise cubic) and extension (repeat,
+ * clip, otherwise clamp to edge) are read from the image's TextureInfo.
+ * With EXTENSION_CLIP, coordinates outside [0, 1] return transparent black.
+ *
+ * All neighbour texel indices are derived from the unwrapped base index, so
+ * that clamping the base texel at a border does not shift the filter
+ * footprint. */
+ccl_device float4 kernel_tex_image_interp(KernelGlobals *kg, int id, float x, float y)
+{
+	const ccl_global TextureInfo *info = kernel_tex_info(kg, id);
+
+	uint width = info->width;
+	uint height = info->height;
+	uint interpolation = info->interpolation;
+	uint extension = info->extension;
+
+	/* Actual sampling. */
+	if(interpolation == INTERPOLATION_CLOSEST) {
+		int ix, iy;
+		svm_image_texture_frac(x*width, &ix);
+		svm_image_texture_frac(y*height, &iy);
+
+		if(extension == EXTENSION_REPEAT) {
+			ix = svm_image_texture_wrap_periodic(ix, width);
+			iy = svm_image_texture_wrap_periodic(iy, height);
+		}
+		else {
+			if(extension == EXTENSION_CLIP) {
+				if(x < 0.0f || y < 0.0f || x > 1.0f || y > 1.0f) {
+					return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+				}
+			}
+			/* Fall through. */
+			/* EXTENSION_EXTEND */
+			ix = svm_image_texture_wrap_clamp(ix, width);
+			iy = svm_image_texture_wrap_clamp(iy, height);
+		}
+
+		return svm_image_texture_read(kg, id, ix + iy*width);
+	}
+	else {
+		/* Bilinear or bicubic interpolation. */
+		int ix, iy, nix, niy;
+		float tx = svm_image_texture_frac(x*width - 0.5f, &ix);
+		float ty = svm_image_texture_frac(y*height - 0.5f, &iy);
+
+		/* Unwrapped base texel. Neighbours are computed from these values:
+		 * deriving them from the clamped base would select the wrong texels
+		 * when the base index lies outside the image (e.g. -1 at the border). */
+		const int base_ix = ix;
+		const int base_iy = iy;
+
+		if(extension == EXTENSION_REPEAT) {
+			ix = svm_image_texture_wrap_periodic(base_ix, width);
+			iy = svm_image_texture_wrap_periodic(base_iy, height);
+			nix = svm_image_texture_wrap_periodic(base_ix+1, width);
+			niy = svm_image_texture_wrap_periodic(base_iy+1, height);
+		}
+		else {
+			if(extension == EXTENSION_CLIP) {
+				if(x < 0.0f || y < 0.0f || x > 1.0f || y > 1.0f) {
+					return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+				}
+			}
+			/* Fall through. */
+			/* EXTENSION_EXTEND */
+			nix = svm_image_texture_wrap_clamp(base_ix+1, width);
+			niy = svm_image_texture_wrap_clamp(base_iy+1, height);
+
+			ix = svm_image_texture_wrap_clamp(base_ix, width);
+			iy = svm_image_texture_wrap_clamp(base_iy, height);
+		}
+
+		if(interpolation == INTERPOLATION_LINEAR) {
+			/* Bilinear interpolation. */
+			float4 r;
+			r = (1.0f - ty)*(1.0f - tx)*svm_image_texture_read(kg, id, ix + iy*width);
+			r += (1.0f - ty)*tx*svm_image_texture_read(kg, id, nix + iy*width);
+			r += ty*(1.0f - tx)*svm_image_texture_read(kg, id, ix + niy*width);
+			r += ty*tx*svm_image_texture_read(kg, id, nix + niy*width);
+			return r;
+		}
+
+		/* Bicubic interpolation: outer taps at base-1 and base+2. */
+		int pix, piy, nnix, nniy;
+		if(extension == EXTENSION_REPEAT) {
+			pix = svm_image_texture_wrap_periodic(base_ix-1, width);
+			piy = svm_image_texture_wrap_periodic(base_iy-1, height);
+			nnix = svm_image_texture_wrap_periodic(base_ix+2, width);
+			nniy = svm_image_texture_wrap_periodic(base_iy+2, height);
+		}
+		else {
+			pix = svm_image_texture_wrap_clamp(base_ix-1, width);
+			piy = svm_image_texture_wrap_clamp(base_iy-1, height);
+			nnix = svm_image_texture_wrap_clamp(base_ix+2, width);
+			nniy = svm_image_texture_wrap_clamp(base_iy+2, height);
+		}
+
+		/* Column indices and pre-multiplied row offsets of the 4x4 footprint. */
+		const int xc[4] = {pix, ix, nix, nnix};
+		const int yc[4] = {width * piy,
+		                   width * iy,
+		                   width * niy,
+		                   width * nniy};
+		float u[4], v[4];
+		/* Helper macros to keep the code a reasonable size; the compiler is
+		 * expected to inline all the multiplications.
+		 */
+#define DATA(x, y) (svm_image_texture_read(kg, id, xc[x] + yc[y]))
+#define TERM(col) \
+		(v[col] * (u[0] * DATA(0, col) + \
+		           u[1] * DATA(1, col) + \
+		           u[2] * DATA(2, col) + \
+		           u[3] * DATA(3, col)))
+
+		SET_CUBIC_SPLINE_WEIGHTS(u, tx);
+		SET_CUBIC_SPLINE_WEIGHTS(v, ty);
+
+		/* Actual interpolation. */
+		return TERM(0) + TERM(1) + TERM(2) + TERM(3);
+#undef TERM
+#undef DATA
+	}
+}
+
+
+/* Sample 3D image `id` at coordinates (x, y, z), scaled by the image width,
+ * height and depth to obtain texel positions.
+ *
+ * `interp` overrides the interpolation stored in the image's TextureInfo
+ * unless it is INTERPOLATION_NONE. Extension handling matches the 2D
+ * variant: repeat, clip (transparent black outside [0, 1]) or clamp to edge.
+ *
+ * All neighbour texel indices are derived from the unwrapped base index, so
+ * that clamping the base texel at a border does not shift the filter
+ * footprint. */
+ccl_device float4 kernel_tex_image_interp_3d(KernelGlobals *kg, int id, float x, float y, float z, int interp)
+{
+	const ccl_global TextureInfo *info = kernel_tex_info(kg, id);
+
+	uint width = info->width;
+	uint height = info->height;
+	uint depth = info->depth;
+	uint interpolation = (interp == INTERPOLATION_NONE)? info->interpolation: interp;
+	uint extension = info->extension;
+
+	/* Actual sampling. */
+	if(interpolation == INTERPOLATION_CLOSEST) {
+		int ix, iy, iz;
+		svm_image_texture_frac(x*width, &ix);
+		svm_image_texture_frac(y*height, &iy);
+		svm_image_texture_frac(z*depth, &iz);
+
+		if(extension == EXTENSION_REPEAT) {
+			ix = svm_image_texture_wrap_periodic(ix, width);
+			iy = svm_image_texture_wrap_periodic(iy, height);
+			iz = svm_image_texture_wrap_periodic(iz, depth);
+		}
+		else {
+			if(extension == EXTENSION_CLIP) {
+				if(x < 0.0f || y < 0.0f || z < 0.0f ||
+				   x > 1.0f || y > 1.0f || z > 1.0f)
+				{
+					return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+				}
+			}
+			/* Fall through. */
+			/* EXTENSION_EXTEND */
+			ix = svm_image_texture_wrap_clamp(ix, width);
+			iy = svm_image_texture_wrap_clamp(iy, height);
+			iz = svm_image_texture_wrap_clamp(iz, depth);
+		}
+		return svm_image_texture_read(kg, id, ix + iy*width + iz*width*height);
+	}
+	else {
+		/* Trilinear or tricubic interpolation. */
+		int ix, iy, iz, nix, niy, niz;
+		float tx = svm_image_texture_frac(x*(float)width - 0.5f, &ix);
+		float ty = svm_image_texture_frac(y*(float)height - 0.5f, &iy);
+		float tz = svm_image_texture_frac(z*(float)depth - 0.5f, &iz);
+
+		/* Unwrapped base texel. Neighbours are computed from these values:
+		 * deriving them from the clamped base would select the wrong texels
+		 * when the base index lies outside the image (e.g. -1 at the border). */
+		const int base_ix = ix;
+		const int base_iy = iy;
+		const int base_iz = iz;
+
+		if(extension == EXTENSION_REPEAT) {
+			ix = svm_image_texture_wrap_periodic(base_ix, width);
+			iy = svm_image_texture_wrap_periodic(base_iy, height);
+			iz = svm_image_texture_wrap_periodic(base_iz, depth);
+
+			nix = svm_image_texture_wrap_periodic(base_ix+1, width);
+			niy = svm_image_texture_wrap_periodic(base_iy+1, height);
+			niz = svm_image_texture_wrap_periodic(base_iz+1, depth);
+		}
+		else {
+			if(extension == EXTENSION_CLIP) {
+				if(x < 0.0f || y < 0.0f || z < 0.0f ||
+				   x > 1.0f || y > 1.0f || z > 1.0f)
+				{
+					return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+				}
+			}
+			/* Fall through. */
+			/* EXTENSION_EXTEND */
+			nix = svm_image_texture_wrap_clamp(base_ix+1, width);
+			niy = svm_image_texture_wrap_clamp(base_iy+1, height);
+			niz = svm_image_texture_wrap_clamp(base_iz+1, depth);
+
+			ix = svm_image_texture_wrap_clamp(base_ix, width);
+			iy = svm_image_texture_wrap_clamp(base_iy, height);
+			iz = svm_image_texture_wrap_clamp(base_iz, depth);
+		}
+
+		if(interpolation == INTERPOLATION_LINEAR) {
+			/* Trilinear interpolation. */
+			float4 r;
+			r  = (1.0f - tz)*(1.0f - ty)*(1.0f - tx)*svm_image_texture_read(kg, id, ix + iy*width + iz*width*height);
+			r += (1.0f - tz)*(1.0f - ty)*tx*svm_image_texture_read(kg, id, nix + iy*width + iz*width*height);
+			r += (1.0f - tz)*ty*(1.0f - tx)*svm_image_texture_read(kg, id, ix + niy*width + iz*width*height);
+			r += (1.0f - tz)*ty*tx*svm_image_texture_read(kg, id, nix + niy*width + iz*width*height);
+
+			r += tz*(1.0f - ty)*(1.0f - tx)*svm_image_texture_read(kg, id, ix + iy*width + niz*width*height);
+			r += tz*(1.0f - ty)*tx*svm_image_texture_read(kg, id, nix + iy*width + niz*width*height);
+			r += tz*ty*(1.0f - tx)*svm_image_texture_read(kg, id, ix + niy*width + niz*width*height);
+			r += tz*ty*tx*svm_image_texture_read(kg, id, nix + niy*width + niz*width*height);
+			return r;
+		}
+
+		/* Tricubic interpolation: outer taps at base-1 and base+2. */
+		int pix, piy, piz, nnix, nniy, nniz;
+		if(extension == EXTENSION_REPEAT) {
+			pix = svm_image_texture_wrap_periodic(base_ix-1, width);
+			piy = svm_image_texture_wrap_periodic(base_iy-1, height);
+			piz = svm_image_texture_wrap_periodic(base_iz-1, depth);
+			nnix = svm_image_texture_wrap_periodic(base_ix+2, width);
+			nniy = svm_image_texture_wrap_periodic(base_iy+2, height);
+			nniz = svm_image_texture_wrap_periodic(base_iz+2, depth);
+		}
+		else {
+			pix = svm_image_texture_wrap_clamp(base_ix-1, width);
+			piy = svm_image_texture_wrap_clamp(base_iy-1, height);
+			piz = svm_image_texture_wrap_clamp(base_iz-1, depth);
+			nnix = svm_image_texture_wrap_clamp(base_ix+2, width);
+			nniy = svm_image_texture_wrap_clamp(base_iy+2, height);
+			nniz = svm_image_texture_wrap_clamp(base_iz+2, depth);
+		}
+
+		/* Column indices and pre-multiplied row/slice offsets of the 4x4x4
+		 * footprint. */
+		const int xc[4] = {pix, ix, nix, nnix};
+		const int yc[4] = {width * piy,
+		                   width * iy,
+		                   width * niy,
+		                   width * nniy};
+		const int zc[4] = {width * height * piz,
+		                   width * height * iz,
+		                   width * height * niz,
+		                   width * height * nniz};
+		float u[4], v[4], w[4];
+
+		/* Helper macros to keep the code a reasonable size; the compiler is
+		 * expected to inline all the multiplications.
+		 */
+#define DATA(x, y, z) (svm_image_texture_read(kg, id, xc[x] + yc[y] + zc[z]))
+#define COL_TERM(col, row) \
+		(v[col] * (u[0] * DATA(0, col, row) + \
+		           u[1] * DATA(1, col, row) + \
+		           u[2] * DATA(2, col, row) + \
+		           u[3] * DATA(3, col, row)))
+#define ROW_TERM(row) \
+		(w[row] * (COL_TERM(0, row) + \
+		           COL_TERM(1, row) + \
+		           COL_TERM(2, row) + \
+		           COL_TERM(3, row)))
+
+		SET_CUBIC_SPLINE_WEIGHTS(u, tx);
+		SET_CUBIC_SPLINE_WEIGHTS(v, ty);
+		SET_CUBIC_SPLINE_WEIGHTS(w, tz);
+
+		/* Actual interpolation. */
+		return ROW_TERM(0) + ROW_TERM(1) + ROW_TERM(2) + ROW_TERM(3);
+
+#undef COL_TERM
+#undef ROW_TERM
+#undef DATA
+	}
+}
+
+#undef SET_CUBIC_SPLINE_WEIGHTS
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_path_init.cl b/intern/cycles/kernel/kernels/opencl/kernel_path_init.cl
new file mode 100644
index 00000000000..fa210e747c0
--- /dev/null
+++ b/intern/cycles/kernel/kernels/opencl/kernel_path_init.cl
@@ -0,0 +1,24 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +#include "kernel/kernel_compat_opencl.h" +#include "kernel/split/kernel_split_common.h" +#include "kernel/split/kernel_path_init.h" + +#define KERNEL_NAME path_init +#include "kernel/kernels/opencl/kernel_split_function.h" +#undef KERNEL_NAME + diff --git a/intern/cycles/kernel/kernels/opencl/kernel_queue_enqueue.cl b/intern/cycles/kernel/kernels/opencl/kernel_queue_enqueue.cl index 3156dc255fb..68ee6f1d536 100644 --- a/intern/cycles/kernel/kernels/opencl/kernel_queue_enqueue.cl +++ b/intern/cycles/kernel/kernels/opencl/kernel_queue_enqueue.cl @@ -14,93 +14,13 @@ * limitations under the License. */ -#include "../../kernel_compat_opencl.h" -#include "../../kernel_math.h" -#include "../../kernel_types.h" -#include "../../kernel_globals.h" -#include "../../kernel_queues.h" +#include "kernel/kernel_compat_opencl.h" +#include "kernel/split/kernel_split_common.h" +#include "kernel/split/kernel_queue_enqueue.h" -/* - * The kernel "kernel_queue_enqueue" enqueues rays of - * different ray state into their appropriate Queues; - * 1. Rays that have been determined to hit the background from the - * "kernel_scene_intersect" kernel - * are enqueued in QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS; - * 2. Rays that have been determined to be actively participating in path-iteration will be enqueued into QUEUE_ACTIVE_AND_REGENERATED_RAYS. 
- * - * The input and output of the kernel is as follows, - * - * ray_state -------------------------------------------|--- kernel_queue_enqueue --|--- Queue_data (QUEUE_ACTIVE_AND_REGENERATED_RAYS & QUEUE_HITBF_BUFF_UPDATE_TOREGEN_RAYS) - * Queue_index(QUEUE_ACTIVE_AND_REGENERATED_RAYS) ------| |--- Queue_index (QUEUE_ACTIVE_AND_REGENERATED_RAYS & QUEUE_HITBF_BUFF_UPDATE_TOREGEN_RAYS) - * Queue_index(QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS) ---| | - * queuesize -------------------------------------------| | - * - * Note on Queues : - * State of queues during the first time this kernel is called : - * At entry, - * Both QUEUE_ACTIVE_AND_REGENERATED_RAYS and QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be empty. - * At exit, - * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE rays - * QUEUE_HITBF_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_HIT_BACKGROUND rays. - * - * State of queue during other times this kernel is called : - * At entry, - * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be empty. - * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will contain RAY_TO_REGENERATE and RAY_UPDATE_BUFFER rays. - * At exit, - * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE rays. - * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_TO_REGENERATE, RAY_UPDATE_BUFFER, RAY_HIT_BACKGROUND rays. 
- */ -__kernel void kernel_ocl_path_trace_queue_enqueue( - ccl_global int *Queue_data, /* Queue memory */ - ccl_global int *Queue_index, /* Tracks the number of elements in each queue */ - ccl_global char *ray_state, /* Denotes the state of each ray */ - int queuesize) /* Size (capacity) of each queue */ -{ - /* We have only 2 cases (Hit/Not-Hit) */ - ccl_local unsigned int local_queue_atomics[2]; - - int lidx = get_local_id(1) * get_local_size(0) + get_local_id(0); - int ray_index = get_global_id(1) * get_global_size(0) + get_global_id(0); - - if(lidx < 2 ) { - local_queue_atomics[lidx] = 0; - } - barrier(CLK_LOCAL_MEM_FENCE); - - int queue_number = -1; - - if(IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND)) { - queue_number = QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS; - } - else if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) { - queue_number = QUEUE_ACTIVE_AND_REGENERATED_RAYS; - } - - unsigned int my_lqidx; - if(queue_number != -1) { - my_lqidx = get_local_queue_index(queue_number, local_queue_atomics); - } - barrier(CLK_LOCAL_MEM_FENCE); - - if(lidx == 0) { - local_queue_atomics[QUEUE_ACTIVE_AND_REGENERATED_RAYS] = - get_global_per_queue_offset(QUEUE_ACTIVE_AND_REGENERATED_RAYS, - local_queue_atomics, - Queue_index); - local_queue_atomics[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS] = - get_global_per_queue_offset(QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS, - local_queue_atomics, - Queue_index); - } - barrier(CLK_LOCAL_MEM_FENCE); +#define KERNEL_NAME queue_enqueue +#define LOCALS_TYPE QueueEnqueueLocals +#include "kernel/kernels/opencl/kernel_split_function.h" +#undef KERNEL_NAME +#undef LOCALS_TYPE - unsigned int my_gqidx; - if(queue_number != -1) { - my_gqidx = get_global_queue_index(queue_number, - queuesize, - my_lqidx, - local_queue_atomics); - Queue_data[my_gqidx] = ray_index; - } -} diff --git a/intern/cycles/kernel/kernels/opencl/kernel_scene_intersect.cl b/intern/cycles/kernel/kernels/opencl/kernel_scene_intersect.cl index 7f3f433c7a6..10d09377ba9 100644 --- 
a/intern/cycles/kernel/kernels/opencl/kernel_scene_intersect.cl +++ b/intern/cycles/kernel/kernels/opencl/kernel_scene_intersect.cl @@ -14,67 +14,11 @@ * limitations under the License. */ -#include "split/kernel_scene_intersect.h" +#include "kernel/kernel_compat_opencl.h" +#include "kernel/split/kernel_split_common.h" +#include "kernel/split/kernel_scene_intersect.h" -__kernel void kernel_ocl_path_trace_scene_intersect( - ccl_global char *kg, - ccl_constant KernelData *data, - ccl_global uint *rng_coop, - ccl_global Ray *Ray_coop, /* Required for scene_intersect */ - ccl_global PathState *PathState_coop, /* Required for scene_intersect */ - Intersection *Intersection_coop, /* Required for scene_intersect */ - ccl_global char *ray_state, /* Denotes the state of each ray */ - int sw, int sh, - ccl_global int *Queue_data, /* Memory for queues */ - ccl_global int *Queue_index, /* Tracks the number of elements in queues */ - int queuesize, /* Size (capacity) of queues */ - ccl_global char *use_queues_flag, /* used to decide if this kernel should use - * queues to fetch ray index */ -#ifdef __KERNEL_DEBUG__ - DebugData *debugdata_coop, -#endif - int parallel_samples) /* Number of samples to be processed in parallel */ -{ - int x = get_global_id(0); - int y = get_global_id(1); +#define KERNEL_NAME scene_intersect +#include "kernel/kernels/opencl/kernel_split_function.h" +#undef KERNEL_NAME - /* Fetch use_queues_flag */ - ccl_local char local_use_queues_flag; - if(get_local_id(0) == 0 && get_local_id(1) == 0) { - local_use_queues_flag = use_queues_flag[0]; - } - barrier(CLK_LOCAL_MEM_FENCE); - - int ray_index; - if(local_use_queues_flag) { - int thread_index = get_global_id(1) * get_global_size(0) + get_global_id(0); - ray_index = get_ray_index(thread_index, - QUEUE_ACTIVE_AND_REGENERATED_RAYS, - Queue_data, - queuesize, - 0); - - if(ray_index == QUEUE_EMPTY_SLOT) { - return; - } - } else { - if(x < (sw * parallel_samples) && y < sh) { - ray_index = x + y * (sw * 
parallel_samples); - } else { - return; - } - } - - kernel_scene_intersect((KernelGlobals *)kg, - rng_coop, - Ray_coop, - PathState_coop, - Intersection_coop, - ray_state, - sw, sh, - use_queues_flag, -#ifdef __KERNEL_DEBUG__ - debugdata_coop, -#endif - ray_index); -} diff --git a/intern/cycles/kernel/kernels/opencl/kernel_shader_eval.cl b/intern/cycles/kernel/kernels/opencl/kernel_shader_eval.cl index c37856c8f30..40eaa561863 100644 --- a/intern/cycles/kernel/kernels/opencl/kernel_shader_eval.cl +++ b/intern/cycles/kernel/kernels/opencl/kernel_shader_eval.cl @@ -14,55 +14,11 @@ * limitations under the License. */ -#include "split/kernel_shader_eval.h" +#include "kernel/kernel_compat_opencl.h" +#include "kernel/split/kernel_split_common.h" +#include "kernel/split/kernel_shader_eval.h" -__kernel void kernel_ocl_path_trace_shader_eval( - ccl_global char *kg, - ccl_constant KernelData *data, - ccl_global char *sd, /* Output ShaderData structure to be filled */ - ccl_global uint *rng_coop, /* Required for rbsdf calculation */ - ccl_global Ray *Ray_coop, /* Required for setting up shader from ray */ - ccl_global PathState *PathState_coop, /* Required for all functions in this kernel */ - Intersection *Intersection_coop, /* Required for setting up shader from ray */ - ccl_global char *ray_state, /* Denotes the state of each ray */ - ccl_global int *Queue_data, /* queue memory */ - ccl_global int *Queue_index, /* Tracks the number of elements in each queue */ - int queuesize) /* Size (capacity) of each queue */ -{ - /* Enqeueue RAY_TO_REGENERATE rays into QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue. 
*/ - ccl_local unsigned int local_queue_atomics; - if(get_local_id(0) == 0 && get_local_id(1) == 0) { - local_queue_atomics = 0; - } - barrier(CLK_LOCAL_MEM_FENCE); +#define KERNEL_NAME shader_eval +#include "kernel/kernels/opencl/kernel_split_function.h" +#undef KERNEL_NAME - int ray_index = get_global_id(1) * get_global_size(0) + get_global_id(0); - ray_index = get_ray_index(ray_index, - QUEUE_ACTIVE_AND_REGENERATED_RAYS, - Queue_data, - queuesize, - 0); - - if(ray_index == QUEUE_EMPTY_SLOT) { - return; - } - - char enqueue_flag = (IS_STATE(ray_state, ray_index, RAY_TO_REGENERATE)) ? 1 : 0; - enqueue_ray_index_local(ray_index, - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS, - enqueue_flag, - queuesize, - &local_queue_atomics, - Queue_data, - Queue_index); - - /* Continue on with shader evaluation. */ - kernel_shader_eval((KernelGlobals *)kg, - (ShaderData *)sd, - rng_coop, - Ray_coop, - PathState_coop, - Intersection_coop, - ray_state, - ray_index); -} diff --git a/intern/cycles/kernel/kernels/opencl/kernel_shader_setup.cl b/intern/cycles/kernel/kernels/opencl/kernel_shader_setup.cl new file mode 100644 index 00000000000..8c36100f762 --- /dev/null +++ b/intern/cycles/kernel/kernels/opencl/kernel_shader_setup.cl @@ -0,0 +1,26 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "kernel/kernel_compat_opencl.h" +#include "kernel/split/kernel_split_common.h" +#include "kernel/split/kernel_shader_setup.h" + +#define KERNEL_NAME shader_setup +#define LOCALS_TYPE unsigned int +#include "kernel/kernels/opencl/kernel_split_function.h" +#undef KERNEL_NAME +#undef LOCALS_TYPE + diff --git a/intern/cycles/kernel/kernels/opencl/kernel_shader_sort.cl b/intern/cycles/kernel/kernels/opencl/kernel_shader_sort.cl new file mode 100644 index 00000000000..bcacaa4a054 --- /dev/null +++ b/intern/cycles/kernel/kernels/opencl/kernel_shader_sort.cl @@ -0,0 +1,27 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "kernel/kernel_compat_opencl.h" +#include "kernel/split/kernel_split_common.h" +#include "kernel/split/kernel_shader_sort.h" + +__attribute__((reqd_work_group_size(64, 1, 1))) +#define KERNEL_NAME shader_sort +#define LOCALS_TYPE ShaderSortLocals +#include "kernel/kernels/opencl/kernel_split_function.h" +#undef KERNEL_NAME +#undef LOCALS_TYPE + diff --git a/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked.cl b/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked.cl deleted file mode 100644 index edf76fba714..00000000000 --- a/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked.cl +++ /dev/null @@ -1,65 +0,0 @@ -/* - * Copyright 2011-2015 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "split/kernel_shadow_blocked.h" - -__kernel void kernel_ocl_path_trace_shadow_blocked( - ccl_global char *kg, - ccl_constant KernelData *data, - ccl_global PathState *PathState_coop, /* Required for shadow blocked */ - ccl_global Ray *LightRay_dl_coop, /* Required for direct lighting's shadow blocked */ - ccl_global Ray *LightRay_ao_coop, /* Required for AO's shadow blocked */ - ccl_global char *ray_state, - ccl_global int *Queue_data, /* Queue memory */ - ccl_global int *Queue_index, /* Tracks the number of elements in each queue */ - int queuesize) /* Size (capacity) of each queue */ -{ - int lidx = get_local_id(1) * get_local_id(0) + get_local_id(0); - - ccl_local unsigned int ao_queue_length; - ccl_local unsigned int dl_queue_length; - if(lidx == 0) { - ao_queue_length = Queue_index[QUEUE_SHADOW_RAY_CAST_AO_RAYS]; - dl_queue_length = Queue_index[QUEUE_SHADOW_RAY_CAST_DL_RAYS]; - } - barrier(CLK_LOCAL_MEM_FENCE); - - /* flag determining if the current ray is to process shadow ray for AO or DL */ - char shadow_blocked_type = -1; - - int ray_index = QUEUE_EMPTY_SLOT; - int thread_index = get_global_id(1) * get_global_size(0) + get_global_id(0); - if(thread_index < ao_queue_length + dl_queue_length) { - if(thread_index < ao_queue_length) { - ray_index = get_ray_index(thread_index, QUEUE_SHADOW_RAY_CAST_AO_RAYS, Queue_data, queuesize, 1); - shadow_blocked_type = RAY_SHADOW_RAY_CAST_AO; - } else { - ray_index = get_ray_index(thread_index - ao_queue_length, QUEUE_SHADOW_RAY_CAST_DL_RAYS, Queue_data, queuesize, 1); - shadow_blocked_type = RAY_SHADOW_RAY_CAST_DL; - } - } - - if(ray_index == QUEUE_EMPTY_SLOT) - return; - - kernel_shadow_blocked((KernelGlobals *)kg, - PathState_coop, - LightRay_dl_coop, - LightRay_ao_coop, - ray_state, - shadow_blocked_type, - ray_index); -} diff --git a/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked_ao.cl b/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked_ao.cl new file mode 100644 index 
00000000000..8de250a375c --- /dev/null +++ b/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked_ao.cl @@ -0,0 +1,24 @@ +/* + * Copyright 2011-2015 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "kernel/kernel_compat_opencl.h" +#include "kernel/split/kernel_split_common.h" +#include "kernel/split/kernel_shadow_blocked_ao.h" + +#define KERNEL_NAME shadow_blocked_ao +#include "kernel/kernels/opencl/kernel_split_function.h" +#undef KERNEL_NAME + diff --git a/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked_dl.cl b/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked_dl.cl new file mode 100644 index 00000000000..29da77022ed --- /dev/null +++ b/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked_dl.cl @@ -0,0 +1,24 @@ +/* + * Copyright 2011-2015 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "kernel/kernel_compat_opencl.h" +#include "kernel/split/kernel_split_common.h" +#include "kernel/split/kernel_shadow_blocked_dl.h" + +#define KERNEL_NAME shadow_blocked_dl +#include "kernel/kernels/opencl/kernel_split_function.h" +#undef KERNEL_NAME + diff --git a/intern/cycles/kernel/kernels/opencl/kernel_split.cl b/intern/cycles/kernel/kernels/opencl/kernel_split.cl new file mode 100644 index 00000000000..4cbda1bc2e7 --- /dev/null +++ b/intern/cycles/kernel/kernels/opencl/kernel_split.cl @@ -0,0 +1,41 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "kernel/kernel_compat_opencl.h" // PRECOMPILED +#include "kernel/split/kernel_split_common.h" // PRECOMPILED + +#include "kernel/kernels/opencl/kernel_state_buffer_size.cl" +#include "kernel/kernels/opencl/kernel_data_init.cl" +#include "kernel/kernels/opencl/kernel_path_init.cl" + +#include "kernel/kernels/opencl/kernel_scene_intersect.cl" +#include "kernel/kernels/opencl/kernel_lamp_emission.cl" +#include "kernel/kernels/opencl/kernel_do_volume.cl" +#include "kernel/kernels/opencl/kernel_indirect_background.cl" +#include "kernel/kernels/opencl/kernel_queue_enqueue.cl" +#include "kernel/kernels/opencl/kernel_shader_setup.cl" +#include "kernel/kernels/opencl/kernel_shader_sort.cl" +#include "kernel/kernels/opencl/kernel_shader_eval.cl" +#include "kernel/kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl" +#include "kernel/kernels/opencl/kernel_subsurface_scatter.cl" +#include "kernel/kernels/opencl/kernel_direct_lighting.cl" +#include "kernel/kernels/opencl/kernel_shadow_blocked_ao.cl" +#include "kernel/kernels/opencl/kernel_shadow_blocked_dl.cl" +#include "kernel/kernels/opencl/kernel_enqueue_inactive.cl" +#include "kernel/kernels/opencl/kernel_next_iteration_setup.cl" +#include "kernel/kernels/opencl/kernel_indirect_subsurface.cl" +#include "kernel/kernels/opencl/kernel_buffer_update.cl" + diff --git a/intern/cycles/kernel/kernels/opencl/kernel_split_function.h b/intern/cycles/kernel/kernels/opencl/kernel_split_function.h new file mode 100644 index 00000000000..6aa7681cbed --- /dev/null +++ b/intern/cycles/kernel/kernels/opencl/kernel_split_function.h @@ -0,0 +1,65 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#define KERNEL_NAME_JOIN(a, b) a ## _ ## b +#define KERNEL_NAME_EVAL(a, b) KERNEL_NAME_JOIN(a, b) + +__kernel void KERNEL_NAME_EVAL(kernel_ocl_path_trace, KERNEL_NAME)( + ccl_global char *kg_global, + ccl_constant KernelData *data, + + ccl_global void *split_data_buffer, + ccl_global char *ray_state, + + KERNEL_BUFFER_PARAMS, + + ccl_global int *queue_index, + ccl_global char *use_queues_flag, + ccl_global unsigned int *work_pools, + ccl_global float *buffer + ) +{ +#ifdef LOCALS_TYPE + ccl_local LOCALS_TYPE locals; +#endif + + KernelGlobals *kg = (KernelGlobals*)kg_global; + + if(ccl_local_id(0) + ccl_local_id(1) == 0) { + kg->data = data; + + kernel_split_params.queue_index = queue_index; + kernel_split_params.use_queues_flag = use_queues_flag; + kernel_split_params.work_pools = work_pools; + kernel_split_params.tile.buffer = buffer; + + split_data_init(kg, &kernel_split_state, ccl_global_size(0)*ccl_global_size(1), split_data_buffer, ray_state); + + } + + kernel_set_buffer_pointers(kg, KERNEL_BUFFER_ARGS); + + KERNEL_NAME_EVAL(kernel, KERNEL_NAME)( + kg +#ifdef LOCALS_TYPE + , &locals +#endif + ); +} + +#undef KERNEL_NAME_JOIN +#undef KERNEL_NAME_EVAL + diff --git a/intern/cycles/kernel/kernels/opencl/kernel_state_buffer_size.cl b/intern/cycles/kernel/kernels/opencl/kernel_state_buffer_size.cl new file mode 100644 index 00000000000..c10ecc426c6 --- /dev/null +++ b/intern/cycles/kernel/kernels/opencl/kernel_state_buffer_size.cl @@ -0,0 +1,29 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, 
Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "kernel/kernel_compat_opencl.h" +#include "kernel/split/kernel_split_common.h" + +__kernel void kernel_ocl_path_trace_state_buffer_size( + ccl_global char *kg, + ccl_constant KernelData *data, + uint num_threads, + ccl_global uint64_t *size) +{ + ((KernelGlobals*)kg)->data = data; + *size = split_data_buffer_size((KernelGlobals*)kg, num_threads); +} + diff --git a/intern/cycles/kernel/kernels/opencl/kernel_subsurface_scatter.cl b/intern/cycles/kernel/kernels/opencl/kernel_subsurface_scatter.cl new file mode 100644 index 00000000000..2b3be38df84 --- /dev/null +++ b/intern/cycles/kernel/kernels/opencl/kernel_subsurface_scatter.cl @@ -0,0 +1,24 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "kernel/kernel_compat_opencl.h" +#include "kernel/split/kernel_split_common.h" +#include "kernel/split/kernel_subsurface_scatter.h" + +#define KERNEL_NAME subsurface_scatter +#include "kernel/kernels/opencl/kernel_split_function.h" +#undef KERNEL_NAME + diff --git a/intern/cycles/kernel/kernels/opencl/kernel_sum_all_radiance.cl b/intern/cycles/kernel/kernels/opencl/kernel_sum_all_radiance.cl deleted file mode 100644 index 88a1ed830af..00000000000 --- a/intern/cycles/kernel/kernels/opencl/kernel_sum_all_radiance.cl +++ /dev/null @@ -1,38 +0,0 @@ -/* - * Copyright 2011-2015 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "split/kernel_sum_all_radiance.h" - -__kernel void kernel_ocl_path_trace_sum_all_radiance( - ccl_constant KernelData *data, /* To get pass_stride to offet into buffer */ - ccl_global float *buffer, /* Output buffer of RenderTile */ - ccl_global float *per_sample_output_buffer, /* Radiance contributed by all samples */ - int parallel_samples, int sw, int sh, int stride, - int buffer_offset_x, - int buffer_offset_y, - int buffer_stride, - int start_sample) -{ - kernel_sum_all_radiance(data, - buffer, - per_sample_output_buffer, - parallel_samples, - sw, sh, stride, - buffer_offset_x, - buffer_offset_y, - buffer_stride, - start_sample); -} diff --git a/intern/cycles/kernel/openvdb/vdb_intern.h b/intern/cycles/kernel/openvdb/vdb_intern.h index 71d6b81e0ff..0ebb0eed094 100644 --- a/intern/cycles/kernel/openvdb/vdb_intern.h +++ b/intern/cycles/kernel/openvdb/vdb_intern.h @@ -33,7 +33,7 @@ # pragma GCC diagnostic pop #endif -#include "util_vector.h" +#include "util/util_vector.h" CCL_NAMESPACE_BEGIN diff --git a/intern/cycles/kernel/osl/CMakeLists.txt b/intern/cycles/kernel/osl/CMakeLists.txt index 98de40e5a8a..d2eb89e0e0a 100644 --- a/intern/cycles/kernel/osl/CMakeLists.txt +++ b/intern/cycles/kernel/osl/CMakeLists.txt @@ -1,12 +1,6 @@ set(INC - . - .. - ../svm - ../../graph - ../../render - ../../util - ../../device + ../.. 
) set(INC_SYS diff --git a/intern/cycles/kernel/osl/background.cpp b/intern/cycles/kernel/osl/background.cpp index d835f9be45c..2e73e7a601e 100644 --- a/intern/cycles/kernel/osl/background.cpp +++ b/intern/cycles/kernel/osl/background.cpp @@ -34,10 +34,10 @@ #include <OSL/genclosure.h> -#include "osl_closures.h" +#include "kernel/osl/osl_closures.h" -#include "kernel_compat_cpu.h" -#include "closure/alloc.h" +#include "kernel/kernel_compat_cpu.h" +#include "kernel/closure/alloc.h" CCL_NAMESPACE_BEGIN diff --git a/intern/cycles/kernel/osl/bsdf_diffuse_ramp.cpp b/intern/cycles/kernel/osl/bsdf_diffuse_ramp.cpp index bc26f42b559..ea18f2c8c86 100644 --- a/intern/cycles/kernel/osl/bsdf_diffuse_ramp.cpp +++ b/intern/cycles/kernel/osl/bsdf_diffuse_ramp.cpp @@ -34,13 +34,13 @@ #include <OSL/genclosure.h> -#include "kernel_compat_cpu.h" -#include "osl_closures.h" +#include "kernel/kernel_compat_cpu.h" +#include "kernel/osl/osl_closures.h" -#include "kernel_types.h" -#include "kernel_montecarlo.h" -#include "closure/alloc.h" -#include "closure/bsdf_diffuse_ramp.h" +#include "kernel/kernel_types.h" +#include "kernel/kernel_montecarlo.h" +#include "kernel/closure/alloc.h" +#include "kernel/closure/bsdf_diffuse_ramp.h" CCL_NAMESPACE_BEGIN diff --git a/intern/cycles/kernel/osl/bsdf_phong_ramp.cpp b/intern/cycles/kernel/osl/bsdf_phong_ramp.cpp index 14c7644936e..a26671eb09e 100644 --- a/intern/cycles/kernel/osl/bsdf_phong_ramp.cpp +++ b/intern/cycles/kernel/osl/bsdf_phong_ramp.cpp @@ -34,12 +34,12 @@ #include <OSL/genclosure.h> -#include "kernel_compat_cpu.h" -#include "osl_closures.h" +#include "kernel/kernel_compat_cpu.h" +#include "kernel/osl/osl_closures.h" -#include "kernel_types.h" -#include "closure/alloc.h" -#include "closure/bsdf_phong_ramp.h" +#include "kernel/kernel_types.h" +#include "kernel/closure/alloc.h" +#include "kernel/closure/bsdf_phong_ramp.h" CCL_NAMESPACE_BEGIN diff --git a/intern/cycles/kernel/osl/emissive.cpp b/intern/cycles/kernel/osl/emissive.cpp index 
3f13e08b302..8843a196dad 100644 --- a/intern/cycles/kernel/osl/emissive.cpp +++ b/intern/cycles/kernel/osl/emissive.cpp @@ -34,12 +34,12 @@ #include <OSL/genclosure.h> -#include "osl_closures.h" +#include "kernel/osl/osl_closures.h" -#include "kernel_compat_cpu.h" -#include "kernel_types.h" -#include "closure/alloc.h" -#include "closure/emissive.h" +#include "kernel/kernel_compat_cpu.h" +#include "kernel/kernel_types.h" +#include "kernel/closure/alloc.h" +#include "kernel/closure/emissive.h" CCL_NAMESPACE_BEGIN diff --git a/intern/cycles/kernel/osl/osl_bssrdf.cpp b/intern/cycles/kernel/osl/osl_bssrdf.cpp index 3614717e28c..27a96720c1e 100644 --- a/intern/cycles/kernel/osl/osl_bssrdf.cpp +++ b/intern/cycles/kernel/osl/osl_bssrdf.cpp @@ -32,15 +32,17 @@ #include <OSL/genclosure.h> -#include "kernel_compat_cpu.h" -#include "osl_closures.h" +#include "kernel/kernel_compat_cpu.h" +#include "kernel/osl/osl_closures.h" -#include "kernel_types.h" -#include "kernel_montecarlo.h" +#include "kernel/kernel_types.h" +#include "kernel/kernel_montecarlo.h" -#include "closure/alloc.h" -#include "closure/bsdf_diffuse.h" -#include "closure/bssrdf.h" +#include "kernel/closure/alloc.h" +#include "kernel/closure/bsdf_util.h" +#include "kernel/closure/bsdf_diffuse.h" +#include "kernel/closure/bsdf_principled_diffuse.h" +#include "kernel/closure/bssrdf.h" CCL_NAMESPACE_BEGIN @@ -78,7 +80,8 @@ public: bssrdf->albedo = albedo.x; bssrdf->sharpness = sharpness; bssrdf->N = params.N; - ccl_fetch(sd, flag) |= bssrdf_setup(bssrdf, (ClosureType)type); + bssrdf->roughness = params.roughness; + sd->flag |= bssrdf_setup(bssrdf, (ClosureType)type); } bssrdf = bssrdf_alloc(sd, make_float3(0.0f, weight.y, 0.0f)); @@ -89,7 +92,8 @@ public: bssrdf->albedo = albedo.y; bssrdf->sharpness = sharpness; bssrdf->N = params.N; - ccl_fetch(sd, flag) |= bssrdf_setup(bssrdf, (ClosureType)type); + bssrdf->roughness = params.roughness; + sd->flag |= bssrdf_setup(bssrdf, (ClosureType)type); } bssrdf = 
bssrdf_alloc(sd, make_float3(0.0f, 0.0f, weight.z)); @@ -100,7 +104,8 @@ public: bssrdf->albedo = albedo.z; bssrdf->sharpness = sharpness; bssrdf->N = params.N; - ccl_fetch(sd, flag) |= bssrdf_setup(bssrdf, (ClosureType)type); + bssrdf->roughness = params.roughness; + sd->flag |= bssrdf_setup(bssrdf, (ClosureType)type); } } } @@ -180,5 +185,31 @@ ClosureParam *closure_bssrdf_burley_params() CCLOSURE_PREPARE(closure_bssrdf_burley_prepare, BurleyBSSRDFClosure) +/* Disney principled */ + +class PrincipledBSSRDFClosure : public CBSSRDFClosure { +public: + void setup(ShaderData *sd, int path_flag, float3 weight) + { + alloc(sd, path_flag, weight, CLOSURE_BSSRDF_PRINCIPLED_ID); + } +}; + +ClosureParam *closure_bssrdf_principled_params() +{ + static ClosureParam params[] = { + CLOSURE_FLOAT3_PARAM(PrincipledBSSRDFClosure, params.N), + CLOSURE_FLOAT3_PARAM(PrincipledBSSRDFClosure, radius), + CLOSURE_FLOAT_PARAM(PrincipledBSSRDFClosure, params.texture_blur), + CLOSURE_FLOAT3_PARAM(PrincipledBSSRDFClosure, albedo), + CLOSURE_FLOAT_PARAM(PrincipledBSSRDFClosure, params.roughness), + CLOSURE_STRING_KEYPARAM(PrincipledBSSRDFClosure, label, "label"), + CLOSURE_FINISH_PARAM(PrincipledBSSRDFClosure) + }; + return params; +} + +CCLOSURE_PREPARE(closure_bssrdf_principled_prepare, PrincipledBSSRDFClosure) + CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/osl/osl_closures.cpp b/intern/cycles/kernel/osl/osl_closures.cpp index 94de782dca0..14c5c1c3db5 100644 --- a/intern/cycles/kernel/osl/osl_closures.cpp +++ b/intern/cycles/kernel/osl/osl_closures.cpp @@ -33,33 +33,36 @@ #include <OSL/genclosure.h> #include <OSL/oslclosure.h> -#include "osl_closures.h" -#include "osl_shader.h" - -#include "util_debug.h" -#include "util_math.h" -#include "util_param.h" - -#include "kernel_types.h" -#include "kernel_compat_cpu.h" -#include "kernel_globals.h" -#include "kernel_montecarlo.h" -#include "kernel_random.h" - -#include "closure/alloc.h" -#include "closure/bsdf_util.h" -#include 
"closure/bsdf_ashikhmin_velvet.h" -#include "closure/bsdf_diffuse.h" -#include "closure/bsdf_microfacet.h" -#include "closure/bsdf_microfacet_multi.h" -#include "closure/bsdf_oren_nayar.h" -#include "closure/bsdf_reflection.h" -#include "closure/bsdf_refraction.h" -#include "closure/bsdf_transparent.h" -#include "closure/bsdf_ashikhmin_shirley.h" -#include "closure/bsdf_toon.h" -#include "closure/bsdf_hair.h" -#include "closure/volume.h" +#include "kernel/osl/osl_closures.h" +#include "kernel/osl/osl_shader.h" + +#include "util/util_debug.h" +#include "util/util_math.h" +#include "util/util_param.h" + +#include "kernel/kernel_types.h" +#include "kernel/kernel_compat_cpu.h" +#include "kernel/split/kernel_split_data_types.h" +#include "kernel/kernel_globals.h" +#include "kernel/kernel_montecarlo.h" +#include "kernel/kernel_random.h" + +#include "kernel/closure/alloc.h" +#include "kernel/closure/bsdf_util.h" +#include "kernel/closure/bsdf_ashikhmin_velvet.h" +#include "kernel/closure/bsdf_diffuse.h" +#include "kernel/closure/bsdf_microfacet.h" +#include "kernel/closure/bsdf_microfacet_multi.h" +#include "kernel/closure/bsdf_oren_nayar.h" +#include "kernel/closure/bsdf_reflection.h" +#include "kernel/closure/bsdf_refraction.h" +#include "kernel/closure/bsdf_transparent.h" +#include "kernel/closure/bsdf_ashikhmin_shirley.h" +#include "kernel/closure/bsdf_toon.h" +#include "kernel/closure/bsdf_hair.h" +#include "kernel/closure/bsdf_principled_diffuse.h" +#include "kernel/closure/bsdf_principled_sheen.h" +#include "kernel/closure/volume.h" CCL_NAMESPACE_BEGIN @@ -153,7 +156,7 @@ BSDF_CLOSURE_CLASS_BEGIN(MicrofacetBeckmannRefraction, microfacet_beckmann_refra BSDF_CLOSURE_CLASS_END(MicrofacetBeckmannRefraction, microfacet_beckmann_refraction) BSDF_CLOSURE_CLASS_BEGIN(HairReflection, hair_reflection, HairBsdf, LABEL_GLOSSY) - CLOSURE_FLOAT3_PARAM(HairReflectionClosure, unused), + CLOSURE_FLOAT3_PARAM(HairReflectionClosure, params.N), 
CLOSURE_FLOAT_PARAM(HairReflectionClosure, params.roughness1), CLOSURE_FLOAT_PARAM(HairReflectionClosure, params.roughness2), CLOSURE_FLOAT3_PARAM(HairReflectionClosure, params.T), @@ -161,7 +164,7 @@ BSDF_CLOSURE_CLASS_BEGIN(HairReflection, hair_reflection, HairBsdf, LABEL_GLOSSY BSDF_CLOSURE_CLASS_END(HairReflection, hair_reflection) BSDF_CLOSURE_CLASS_BEGIN(HairTransmission, hair_transmission, HairBsdf, LABEL_GLOSSY) - CLOSURE_FLOAT3_PARAM(HairTransmissionClosure, unused), + CLOSURE_FLOAT3_PARAM(HairTransmissionClosure, params.N), CLOSURE_FLOAT_PARAM(HairTransmissionClosure, params.roughness1), CLOSURE_FLOAT_PARAM(HairTransmissionClosure, params.roughness2), CLOSURE_FLOAT3_PARAM(HairReflectionClosure, params.T), @@ -175,6 +178,63 @@ VOLUME_CLOSURE_CLASS_END(VolumeHenyeyGreenstein, henyey_greenstein) VOLUME_CLOSURE_CLASS_BEGIN(VolumeAbsorption, absorption, ShaderClosure, LABEL_SINGULAR) VOLUME_CLOSURE_CLASS_END(VolumeAbsorption, absorption) +BSDF_CLOSURE_CLASS_BEGIN(PrincipledDiffuse, principled_diffuse, PrincipledDiffuseBsdf, LABEL_DIFFUSE) + CLOSURE_FLOAT3_PARAM(PrincipledDiffuseClosure, params.N), + CLOSURE_FLOAT_PARAM(PrincipledDiffuseClosure, params.roughness), +BSDF_CLOSURE_CLASS_END(PrincipledDiffuse, principled_diffuse) + +BSDF_CLOSURE_CLASS_BEGIN(PrincipledSheen, principled_sheen, PrincipledSheenBsdf, LABEL_DIFFUSE) + CLOSURE_FLOAT3_PARAM(PrincipledSheenClosure, params.N), +BSDF_CLOSURE_CLASS_END(PrincipledSheen, principled_sheen) + +/* DISNEY PRINCIPLED CLEARCOAT */ +class PrincipledClearcoatClosure : public CBSDFClosure { +public: + MicrofacetBsdf params; + float clearcoat, clearcoat_roughness; + + MicrofacetBsdf *alloc(ShaderData *sd, int path_flag, float3 weight) + { + MicrofacetBsdf *bsdf = (MicrofacetBsdf*)bsdf_alloc_osl(sd, sizeof(MicrofacetBsdf), weight, &params); + MicrofacetExtra *extra = (MicrofacetExtra*)closure_alloc_extra(sd, sizeof(MicrofacetExtra)); + if(bsdf && extra) { + bsdf->extra = extra; + + bsdf->ior = 1.5f; + + bsdf->alpha_x = 
clearcoat_roughness; + bsdf->alpha_y = clearcoat_roughness; + + bsdf->extra->cspec0 = make_float3(0.04f, 0.04f, 0.04f); + bsdf->extra->clearcoat = clearcoat; + + return bsdf; + } + + return NULL; + } + + void setup(ShaderData *sd, int path_flag, float3 weight) + { + MicrofacetBsdf *bsdf = alloc(sd, path_flag, weight); + sd->flag |= (bsdf) ? bsdf_microfacet_ggx_clearcoat_setup(bsdf, sd) : 0; + } +}; + +ClosureParam *closure_bsdf_principled_clearcoat_params() +{ + static ClosureParam params[] = { + CLOSURE_FLOAT3_PARAM(PrincipledClearcoatClosure, params.N), + CLOSURE_FLOAT_PARAM(PrincipledClearcoatClosure, clearcoat), + CLOSURE_FLOAT_PARAM(PrincipledClearcoatClosure, clearcoat_roughness), + CLOSURE_STRING_KEYPARAM(PrincipledClearcoatClosure, label, "label"), + CLOSURE_FINISH_PARAM(PrincipledClearcoatClosure) + }; + return params; +} +CCLOSURE_PREPARE(closure_bsdf_principled_clearcoat_prepare, PrincipledClearcoatClosure) + + /* Registration */ static void register_closure(OSL::ShadingSystem *ss, const char *name, int id, OSL::ClosureParam *params, OSL::PrepareClosureFunc prepare) @@ -214,6 +274,16 @@ void OSLShader::register_closures(OSLShadingSystem *ss_) closure_bsdf_microfacet_multi_ggx_glass_params(), closure_bsdf_microfacet_multi_ggx_glass_prepare); register_closure(ss, "microfacet_multi_ggx_aniso", id++, closure_bsdf_microfacet_multi_ggx_aniso_params(), closure_bsdf_microfacet_multi_ggx_aniso_prepare); + register_closure(ss, "microfacet_ggx_fresnel", id++, + closure_bsdf_microfacet_ggx_fresnel_params(), closure_bsdf_microfacet_ggx_fresnel_prepare); + register_closure(ss, "microfacet_ggx_aniso_fresnel", id++, + closure_bsdf_microfacet_ggx_aniso_fresnel_params(), closure_bsdf_microfacet_ggx_aniso_fresnel_prepare); + register_closure(ss, "microfacet_multi_ggx_fresnel", id++, + closure_bsdf_microfacet_multi_ggx_fresnel_params(), closure_bsdf_microfacet_multi_ggx_fresnel_prepare); + register_closure(ss, "microfacet_multi_ggx_glass_fresnel", id++, + 
closure_bsdf_microfacet_multi_ggx_glass_fresnel_params(), closure_bsdf_microfacet_multi_ggx_glass_fresnel_prepare); + register_closure(ss, "microfacet_multi_ggx_aniso_fresnel", id++, + closure_bsdf_microfacet_multi_ggx_aniso_fresnel_params(), closure_bsdf_microfacet_multi_ggx_aniso_fresnel_prepare); register_closure(ss, "microfacet_beckmann", id++, bsdf_microfacet_beckmann_params(), bsdf_microfacet_beckmann_prepare); register_closure(ss, "microfacet_beckmann_aniso", id++, @@ -228,6 +298,12 @@ void OSLShader::register_closures(OSLShadingSystem *ss_) bsdf_diffuse_toon_params(), bsdf_diffuse_toon_prepare); register_closure(ss, "glossy_toon", id++, bsdf_glossy_toon_params(), bsdf_glossy_toon_prepare); + register_closure(ss, "principled_diffuse", id++, + bsdf_principled_diffuse_params(), bsdf_principled_diffuse_prepare); + register_closure(ss, "principled_sheen", id++, + bsdf_principled_sheen_params(), bsdf_principled_sheen_prepare); + register_closure(ss, "principled_clearcoat", id++, + closure_bsdf_principled_clearcoat_params(), closure_bsdf_principled_clearcoat_prepare); register_closure(ss, "emission", id++, closure_emission_params(), closure_emission_prepare); @@ -247,6 +323,8 @@ void OSLShader::register_closures(OSLShadingSystem *ss_) closure_bssrdf_gaussian_params(), closure_bssrdf_gaussian_prepare); register_closure(ss, "bssrdf_burley", id++, closure_bssrdf_burley_params(), closure_bssrdf_burley_prepare); + register_closure(ss, "bssrdf_principled", id++, + closure_bssrdf_principled_params(), closure_bssrdf_principled_prepare); register_closure(ss, "hair_reflection", id++, bsdf_hair_reflection_params(), bsdf_hair_reflection_prepare); @@ -277,6 +355,86 @@ bool CBSDFClosure::skip(const ShaderData *sd, int path_flag, int scattering) return false; } + +/* GGX closures with Fresnel */ + +class MicrofacetFresnelClosure : public CBSDFClosure { +public: + MicrofacetBsdf params; + float3 color; + float3 cspec0; + + MicrofacetBsdf *alloc(ShaderData *sd, int path_flag, 
float3 weight) + { + /* Technically, the MultiGGX Glass closure may also transmit. However, + * since this is set statically and only used for caustic flags, this + * is probably as good as it gets. */ + if(!skip(sd, path_flag, LABEL_GLOSSY | LABEL_REFLECT)) { + MicrofacetBsdf *bsdf = (MicrofacetBsdf*)bsdf_alloc_osl(sd, sizeof(MicrofacetBsdf), weight, &params); + MicrofacetExtra *extra = (MicrofacetExtra*)closure_alloc_extra(sd, sizeof(MicrofacetExtra)); + if(bsdf && extra) { + bsdf->extra = extra; + bsdf->extra->color = color; + bsdf->extra->cspec0 = cspec0; + return bsdf; + } + } + + return NULL; + } +}; + +class MicrofacetGGXFresnelClosure : public MicrofacetFresnelClosure { +public: + void setup(ShaderData *sd, int path_flag, float3 weight) + { + MicrofacetBsdf *bsdf = alloc(sd, path_flag, weight); + sd->flag |= (bsdf) ? bsdf_microfacet_ggx_fresnel_setup(bsdf, sd) : 0; + } +}; + +ClosureParam *closure_bsdf_microfacet_ggx_fresnel_params() +{ + static ClosureParam params[] = { + CLOSURE_FLOAT3_PARAM(MicrofacetGGXFresnelClosure, params.N), + CLOSURE_FLOAT_PARAM(MicrofacetGGXFresnelClosure, params.alpha_x), + CLOSURE_FLOAT_PARAM(MicrofacetGGXFresnelClosure, params.ior), + CLOSURE_FLOAT3_PARAM(MicrofacetGGXFresnelClosure, color), + CLOSURE_FLOAT3_PARAM(MicrofacetGGXFresnelClosure, cspec0), + CLOSURE_STRING_KEYPARAM(MicrofacetGGXFresnelClosure, label, "label"), + CLOSURE_FINISH_PARAM(MicrofacetGGXFresnelClosure) + }; + return params; +} +CCLOSURE_PREPARE(closure_bsdf_microfacet_ggx_fresnel_prepare, MicrofacetGGXFresnelClosure); + +class MicrofacetGGXAnisoFresnelClosure : public MicrofacetFresnelClosure { +public: + void setup(ShaderData *sd, int path_flag, float3 weight) + { + MicrofacetBsdf *bsdf = alloc(sd, path_flag, weight); + sd->flag |= (bsdf) ? 
bsdf_microfacet_ggx_aniso_fresnel_setup(bsdf, sd) : 0; + } +}; + +ClosureParam *closure_bsdf_microfacet_ggx_aniso_fresnel_params() +{ + static ClosureParam params[] = { + CLOSURE_FLOAT3_PARAM(MicrofacetGGXFresnelClosure, params.N), + CLOSURE_FLOAT3_PARAM(MicrofacetGGXFresnelClosure, params.T), + CLOSURE_FLOAT_PARAM(MicrofacetGGXFresnelClosure, params.alpha_x), + CLOSURE_FLOAT_PARAM(MicrofacetGGXFresnelClosure, params.alpha_y), + CLOSURE_FLOAT_PARAM(MicrofacetGGXFresnelClosure, params.ior), + CLOSURE_FLOAT3_PARAM(MicrofacetGGXFresnelClosure, color), + CLOSURE_FLOAT3_PARAM(MicrofacetGGXFresnelClosure, cspec0), + CLOSURE_STRING_KEYPARAM(MicrofacetGGXFresnelClosure, label, "label"), + CLOSURE_FINISH_PARAM(MicrofacetGGXFresnelClosure) + }; + return params; +} +CCLOSURE_PREPARE(closure_bsdf_microfacet_ggx_aniso_fresnel_prepare, MicrofacetGGXAnisoFresnelClosure); + + /* Multiscattering GGX closures */ class MicrofacetMultiClosure : public CBSDFClosure { @@ -286,7 +444,7 @@ public: MicrofacetBsdf *alloc(ShaderData *sd, int path_flag, float3 weight) { - /* Technically, the MultiGGX Glass closure may also transmit. However, + /* Technically, the MultiGGX closure may also transmit. However, * since this is set statically and only used for caustic flags, this * is probably as good as it gets. */ if(!skip(sd, path_flag, LABEL_GLOSSY|LABEL_REFLECT)) { @@ -374,5 +532,110 @@ ClosureParam *closure_bsdf_microfacet_multi_ggx_glass_params() } CCLOSURE_PREPARE(closure_bsdf_microfacet_multi_ggx_glass_prepare, MicrofacetMultiGGXGlassClosure); + +/* Multiscattering GGX closures with Fresnel */ + +class MicrofacetMultiFresnelClosure : public CBSDFClosure { +public: + MicrofacetBsdf params; + float3 color; + float3 cspec0; + + MicrofacetBsdf *alloc(ShaderData *sd, int path_flag, float3 weight) + { + /* Technically, the MultiGGX closure may also transmit. However, + * since this is set statically and only used for caustic flags, this + * is probably as good as it gets. 
+ */ + if(!skip(sd, path_flag, LABEL_GLOSSY | LABEL_REFLECT)) { + MicrofacetBsdf *bsdf = (MicrofacetBsdf*)bsdf_alloc_osl(sd, sizeof(MicrofacetBsdf), weight, &params); + MicrofacetExtra *extra = (MicrofacetExtra*)closure_alloc_extra(sd, sizeof(MicrofacetExtra)); + if(bsdf && extra) { + bsdf->extra = extra; + bsdf->extra->color = color; + bsdf->extra->cspec0 = cspec0; + return bsdf; + } + } + + return NULL; + } +}; + +class MicrofacetMultiGGXFresnelClosure : public MicrofacetMultiFresnelClosure { +public: + void setup(ShaderData *sd, int path_flag, float3 weight) + { + MicrofacetBsdf *bsdf = alloc(sd, path_flag, weight); + sd->flag |= (bsdf) ? bsdf_microfacet_multi_ggx_fresnel_setup(bsdf, sd) : 0; + } +}; + +ClosureParam *closure_bsdf_microfacet_multi_ggx_fresnel_params() +{ + static ClosureParam params[] = { + CLOSURE_FLOAT3_PARAM(MicrofacetMultiGGXFresnelClosure, params.N), + CLOSURE_FLOAT_PARAM(MicrofacetMultiGGXFresnelClosure, params.alpha_x), + CLOSURE_FLOAT_PARAM(MicrofacetMultiGGXFresnelClosure, params.ior), + CLOSURE_FLOAT3_PARAM(MicrofacetMultiGGXFresnelClosure, color), + CLOSURE_FLOAT3_PARAM(MicrofacetMultiGGXFresnelClosure, cspec0), + CLOSURE_STRING_KEYPARAM(MicrofacetMultiGGXFresnelClosure, label, "label"), + CLOSURE_FINISH_PARAM(MicrofacetMultiGGXFresnelClosure) + }; + return params; +} +CCLOSURE_PREPARE(closure_bsdf_microfacet_multi_ggx_fresnel_prepare, MicrofacetMultiGGXFresnelClosure); + +class MicrofacetMultiGGXAnisoFresnelClosure : public MicrofacetMultiFresnelClosure { +public: + void setup(ShaderData *sd, int path_flag, float3 weight) + { + MicrofacetBsdf *bsdf = alloc(sd, path_flag, weight); + sd->flag |= (bsdf) ? 
bsdf_microfacet_multi_ggx_aniso_fresnel_setup(bsdf, sd) : 0; + } +}; + +ClosureParam *closure_bsdf_microfacet_multi_ggx_aniso_fresnel_params() +{ + static ClosureParam params[] = { + CLOSURE_FLOAT3_PARAM(MicrofacetMultiGGXFresnelClosure, params.N), + CLOSURE_FLOAT3_PARAM(MicrofacetMultiGGXFresnelClosure, params.T), + CLOSURE_FLOAT_PARAM(MicrofacetMultiGGXFresnelClosure, params.alpha_x), + CLOSURE_FLOAT_PARAM(MicrofacetMultiGGXFresnelClosure, params.alpha_y), + CLOSURE_FLOAT_PARAM(MicrofacetMultiGGXFresnelClosure, params.ior), + CLOSURE_FLOAT3_PARAM(MicrofacetMultiGGXFresnelClosure, color), + CLOSURE_FLOAT3_PARAM(MicrofacetMultiGGXFresnelClosure, cspec0), + CLOSURE_STRING_KEYPARAM(MicrofacetMultiGGXFresnelClosure, label, "label"), + CLOSURE_FINISH_PARAM(MicrofacetMultiGGXFresnelClosure) + }; + return params; +} +CCLOSURE_PREPARE(closure_bsdf_microfacet_multi_ggx_aniso_fresnel_prepare, MicrofacetMultiGGXAnisoFresnelClosure); + +class MicrofacetMultiGGXGlassFresnelClosure : public MicrofacetMultiFresnelClosure { +public: + MicrofacetMultiGGXGlassFresnelClosure() : MicrofacetMultiFresnelClosure() {} + + void setup(ShaderData *sd, int path_flag, float3 weight) + { + MicrofacetBsdf *bsdf = alloc(sd, path_flag, weight); + sd->flag |= (bsdf) ? 
bsdf_microfacet_multi_ggx_glass_fresnel_setup(bsdf, sd) : 0; + } +}; + +ClosureParam *closure_bsdf_microfacet_multi_ggx_glass_fresnel_params() +{ + static ClosureParam params[] = { + CLOSURE_FLOAT3_PARAM(MicrofacetMultiGGXFresnelClosure, params.N), + CLOSURE_FLOAT_PARAM(MicrofacetMultiGGXFresnelClosure, params.alpha_x), + CLOSURE_FLOAT_PARAM(MicrofacetMultiGGXFresnelClosure, params.ior), + CLOSURE_FLOAT3_PARAM(MicrofacetMultiGGXFresnelClosure, color), + CLOSURE_FLOAT3_PARAM(MicrofacetMultiGGXFresnelClosure, cspec0), + CLOSURE_STRING_KEYPARAM(MicrofacetMultiGGXFresnelClosure, label, "label"), + CLOSURE_FINISH_PARAM(MicrofacetMultiGGXFresnelClosure) + }; + return params; +} +CCLOSURE_PREPARE(closure_bsdf_microfacet_multi_ggx_glass_fresnel_prepare, MicrofacetMultiGGXGlassFresnelClosure); + CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/osl/osl_closures.h b/intern/cycles/kernel/osl/osl_closures.h index cd7b33703ff..ff5fd9cc905 100644 --- a/intern/cycles/kernel/osl/osl_closures.h +++ b/intern/cycles/kernel/osl/osl_closures.h @@ -33,8 +33,8 @@ #ifndef __OSL_CLOSURES_H__ #define __OSL_CLOSURES_H__ -#include "util_types.h" -#include "kernel_types.h" +#include "util/util_types.h" +#include "kernel/kernel_types.h" #include <OSL/oslclosure.h> #include <OSL/oslexec.h> @@ -51,10 +51,17 @@ OSL::ClosureParam *closure_bsdf_phong_ramp_params(); OSL::ClosureParam *closure_bssrdf_cubic_params(); OSL::ClosureParam *closure_bssrdf_gaussian_params(); OSL::ClosureParam *closure_bssrdf_burley_params(); +OSL::ClosureParam *closure_bssrdf_principled_params(); OSL::ClosureParam *closure_henyey_greenstein_volume_params(); OSL::ClosureParam *closure_bsdf_microfacet_multi_ggx_params(); OSL::ClosureParam *closure_bsdf_microfacet_multi_ggx_glass_params(); OSL::ClosureParam *closure_bsdf_microfacet_multi_ggx_aniso_params(); +OSL::ClosureParam *closure_bsdf_microfacet_ggx_fresnel_params(); +OSL::ClosureParam *closure_bsdf_microfacet_ggx_aniso_fresnel_params(); +OSL::ClosureParam 
*closure_bsdf_microfacet_multi_ggx_fresnel_params(); +OSL::ClosureParam *closure_bsdf_microfacet_multi_ggx_glass_fresnel_params(); +OSL::ClosureParam *closure_bsdf_microfacet_multi_ggx_aniso_fresnel_params(); +OSL::ClosureParam *closure_bsdf_principled_clearcoat_params(); void closure_emission_prepare(OSL::RendererServices *, int id, void *data); void closure_background_prepare(OSL::RendererServices *, int id, void *data); @@ -65,10 +72,17 @@ void closure_bsdf_phong_ramp_prepare(OSL::RendererServices *, int id, void *data void closure_bssrdf_cubic_prepare(OSL::RendererServices *, int id, void *data); void closure_bssrdf_gaussian_prepare(OSL::RendererServices *, int id, void *data); void closure_bssrdf_burley_prepare(OSL::RendererServices *, int id, void *data); +void closure_bssrdf_principled_prepare(OSL::RendererServices *, int id, void *data); void closure_henyey_greenstein_volume_prepare(OSL::RendererServices *, int id, void *data); void closure_bsdf_microfacet_multi_ggx_prepare(OSL::RendererServices *, int id, void *data); void closure_bsdf_microfacet_multi_ggx_glass_prepare(OSL::RendererServices *, int id, void *data); void closure_bsdf_microfacet_multi_ggx_aniso_prepare(OSL::RendererServices *, int id, void *data); +void closure_bsdf_microfacet_ggx_fresnel_prepare(OSL::RendererServices *, int id, void *data); +void closure_bsdf_microfacet_ggx_aniso_fresnel_prepare(OSL::RendererServices *, int id, void *data); +void closure_bsdf_microfacet_multi_ggx_fresnel_prepare(OSL::RendererServices *, int id, void *data); +void closure_bsdf_microfacet_multi_ggx_glass_fresnel_prepare(OSL::RendererServices *, int id, void *data); +void closure_bsdf_microfacet_multi_ggx_aniso_fresnel_prepare(OSL::RendererServices *, int id, void *data); +void closure_bsdf_principled_clearcoat_prepare(OSL::RendererServices *, int id, void *data); #define CCLOSURE_PREPARE(name, classname) \ void name(RendererServices *, int id, void *data) \ diff --git a/intern/cycles/kernel/osl/osl_globals.h 
b/intern/cycles/kernel/osl/osl_globals.h index 65cb7ecc6b4..9585d9f4825 100644 --- a/intern/cycles/kernel/osl/osl_globals.h +++ b/intern/cycles/kernel/osl/osl_globals.h @@ -21,10 +21,10 @@ #include <OSL/oslexec.h> -#include "util_map.h" -#include "util_param.h" -#include "util_thread.h" -#include "util_vector.h" +#include "util/util_map.h" +#include "util/util_param.h" +#include "util/util_thread.h" +#include "util/util_vector.h" #ifndef WIN32 using std::isfinite; @@ -86,7 +86,7 @@ struct OSLThreadData { OSL::ShaderGlobals globals; OSL::PerThreadInfo *osl_thread_info; OSLTraceData tracedata; - OSL::ShadingContext *context[SHADER_CONTEXT_NUM]; + OSL::ShadingContext *context; OIIO::TextureSystem::Perthread *oiio_thread_info; }; diff --git a/intern/cycles/kernel/osl/osl_services.cpp b/intern/cycles/kernel/osl/osl_services.cpp index bc093272eca..c220a5ee3a1 100644 --- a/intern/cycles/kernel/osl/osl_services.cpp +++ b/intern/cycles/kernel/osl/osl_services.cpp @@ -25,38 +25,38 @@ #include <string.h> -#include "mesh.h" -#include "object.h" -#include "scene.h" - -#include "osl_closures.h" -#include "osl_globals.h" -#include "osl_services.h" -#include "osl_shader.h" - -#include "kernel_compat_cpu.h" -#include "kernel_globals.h" -#include "kernel_random.h" -#include "kernel_projection.h" -#include "kernel_differential.h" -#include "kernel_montecarlo.h" -#include "kernel_camera.h" - -#include "kernels/cpu/kernel_cpu_image.h" +#include "render/mesh.h" +#include "render/object.h" +#include "render/scene.h" + +#include "kernel/osl/osl_closures.h" +#include "kernel/osl/osl_globals.h" +#include "kernel/osl/osl_services.h" +#include "kernel/osl/osl_shader.h" + +#include "kernel/kernel_compat_cpu.h" +#include "kernel/split/kernel_split_data_types.h" +#include "kernel/kernel_globals.h" +#include "kernel/kernel_random.h" +#include "kernel/kernel_projection.h" +#include "kernel/kernel_differential.h" +#include "kernel/kernel_montecarlo.h" +#include "kernel/kernel_camera.h" +#include 
"kernel/kernels/cpu/kernel_cpu_image.h" +#include "kernel/geom/geom.h" +#include "kernel/bvh/bvh.h" /* Note: "util_foreach.h" needs to be included after "kernel_compat_cpu.h", as * for some reason ccl::foreach conflicts with openvdb::tools::foreach, which is * indirectly included through "kernel_compat_cpu.h". */ -#include "util_foreach.h" -#include "util_logging.h" -#include "util_string.h" -#include "geom/geom.h" -#include "bvh/bvh.h" +#include "util/util_foreach.h" +#include "util/util_logging.h" +#include "util/util_string.h" -#include "kernel_projection.h" -#include "kernel_accumulate.h" -#include "kernel_shader.h" +#include "kernel/kernel_projection.h" +#include "kernel/kernel_accumulate.h" +#include "kernel/kernel_shader.h" #ifdef WITH_PTEX # include <Ptexture.h> @@ -107,6 +107,8 @@ ustring OSLRenderServices::u_curve_tangent_normal("geom:curve_tangent_normal"); #endif ustring OSLRenderServices::u_path_ray_length("path:ray_length"); ustring OSLRenderServices::u_path_ray_depth("path:ray_depth"); +ustring OSLRenderServices::u_path_diffuse_depth("path:diffuse_depth"); +ustring OSLRenderServices::u_path_glossy_depth("path:glossy_depth"); ustring OSLRenderServices::u_path_transparent_depth("path:transparent_depth"); ustring OSLRenderServices::u_path_transmission_depth("path:transmission_depth"); ustring OSLRenderServices::u_trace("trace"); @@ -715,7 +717,7 @@ bool OSLRenderServices::get_object_standard_attribute(KernelGlobals *kg, ShaderD else motion_triangle_vertices(kg, sd->object, sd->prim, sd->time, P); - if(!(sd->flag & SD_TRANSFORM_APPLIED)) { + if(!(sd->object_flag & SD_OBJECT_TRANSFORM_APPLIED)) { object_position_transform(kg, sd, &P[0]); object_position_transform(kg, sd, &P[1]); object_position_transform(kg, sd, &P[2]); @@ -764,6 +766,24 @@ bool OSLRenderServices::get_background_attribute(KernelGlobals *kg, ShaderData * int f = state->bounce; return set_attribute_int(f, type, derivatives, val); } + else if(name == u_path_diffuse_depth) { + /* Diffuse Ray 
Depth */ + PathState *state = sd->osl_path_state; + int f = state->diffuse_bounce; + return set_attribute_int(f, type, derivatives, val); + } + else if(name == u_path_glossy_depth) { + /* Glossy Ray Depth */ + PathState *state = sd->osl_path_state; + int f = state->glossy_bounce; + return set_attribute_int(f, type, derivatives, val); + } + else if(name == u_path_transmission_depth) { + /* Transmission Ray Depth */ + PathState *state = sd->osl_path_state; + int f = state->transmission_bounce; + return set_attribute_int(f, type, derivatives, val); + } else if(name == u_path_transparent_depth) { /* Transparent Ray Depth */ PathState *state = sd->osl_path_state; @@ -808,7 +828,7 @@ bool OSLRenderServices::get_background_attribute(KernelGlobals *kg, ShaderData * bool OSLRenderServices::get_attribute(OSL::ShaderGlobals *sg, bool derivatives, ustring object_name, TypeDesc type, ustring name, void *val) { - if(sg->renderstate == NULL) + if(sg == NULL || sg->renderstate == NULL) return false; ShaderData *sd = (ShaderData *)(sg->renderstate); @@ -946,7 +966,7 @@ bool OSLRenderServices::texture(ustring filename, if(filename.length() && filename[0] == '@') { int slot = atoi(filename.c_str() + 1); - float4 rgba = kernel_tex_image_interp(slot, s, 1.0f - t); + float4 rgba = kernel_tex_image_interp(kg, slot, s, 1.0f - t); result[0] = rgba[0]; if(nchannels > 1) @@ -1027,7 +1047,7 @@ bool OSLRenderServices::texture3d(ustring filename, bool status; if(filename.length() && filename[0] == '@') { int slot = atoi(filename.c_str() + 1); - float4 rgba = kernel_tex_image_interp_3d(slot, P.x, P.y, P.z); + float4 rgba = kernel_tex_image_interp_3d(kg, slot, P.x, P.y, P.z, INTERPOLATION_NONE); result[0] = rgba[0]; if(nchannels > 1) @@ -1181,8 +1201,9 @@ bool OSLRenderServices::trace(TraceOpt &options, OSL::ShaderGlobals *sg, tracedata->init = true; tracedata->sd.osl_globals = sd->osl_globals; - /* raytrace */ - return scene_intersect(sd->osl_globals, ray, PATH_RAY_ALL_VISIBILITY, 
&tracedata->isect, NULL, 0.0f, 0.0f); + /* Raytrace, leaving out shadow opaque to avoid early exit. */ + uint visibility = PATH_RAY_ALL_VISIBILITY - PATH_RAY_SHADOW_OPAQUE; + return scene_intersect(sd->osl_globals, ray, visibility, &tracedata->isect, NULL, 0.0f, 0.0f); } diff --git a/intern/cycles/kernel/osl/osl_services.h b/intern/cycles/kernel/osl/osl_services.h index 0f2e02c62b0..ec34ca77115 100644 --- a/intern/cycles/kernel/osl/osl_services.h +++ b/intern/cycles/kernel/osl/osl_services.h @@ -165,6 +165,8 @@ public: static ustring u_curve_tangent_normal; static ustring u_path_ray_length; static ustring u_path_ray_depth; + static ustring u_path_diffuse_depth; + static ustring u_path_glossy_depth; static ustring u_path_transparent_depth; static ustring u_path_transmission_depth; static ustring u_trace; diff --git a/intern/cycles/kernel/osl/osl_shader.cpp b/intern/cycles/kernel/osl/osl_shader.cpp index 0d762bbdb38..6b3a996ca12 100644 --- a/intern/cycles/kernel/osl/osl_shader.cpp +++ b/intern/cycles/kernel/osl/osl_shader.cpp @@ -16,21 +16,22 @@ #include <OSL/oslexec.h> -#include "kernel_compat_cpu.h" -#include "kernel_montecarlo.h" -#include "kernel_types.h" -#include "kernel_globals.h" +#include "kernel/kernel_compat_cpu.h" +#include "kernel/kernel_montecarlo.h" +#include "kernel/kernel_types.h" +#include "kernel/split/kernel_split_data_types.h" +#include "kernel/kernel_globals.h" -#include "geom/geom_object.h" +#include "kernel/geom/geom_object.h" -#include "osl_closures.h" -#include "osl_globals.h" -#include "osl_services.h" -#include "osl_shader.h" +#include "kernel/osl/osl_closures.h" +#include "kernel/osl/osl_globals.h" +#include "kernel/osl/osl_services.h" +#include "kernel/osl/osl_shader.h" -#include "util_foreach.h" +#include "util/util_foreach.h" -#include "attribute.h" +#include "render/attribute.h" CCL_NAMESPACE_BEGIN @@ -56,9 +57,7 @@ void OSLShader::thread_init(KernelGlobals *kg, KernelGlobals *kernel_globals, OS tdata->globals.tracedata = 
&tdata->tracedata; tdata->globals.flipHandedness = false; tdata->osl_thread_info = ss->create_thread_info(); - - for(int i = 0; i < SHADER_CONTEXT_NUM; i++) - tdata->context[i] = ss->get_context(tdata->osl_thread_info); + tdata->context = ss->get_context(tdata->osl_thread_info); tdata->oiio_thread_info = osl_globals->ts->get_perthread_info(); @@ -73,9 +72,7 @@ void OSLShader::thread_free(KernelGlobals *kg) OSL::ShadingSystem *ss = (OSL::ShadingSystem*)kg->osl_ss; OSLThreadData *tdata = kg->osl_tdata; - - for(int i = 0; i < SHADER_CONTEXT_NUM; i++) - ss->release_context(tdata->context[i]); + ss->release_context(tdata->context); ss->destroy_thread_info(tdata->osl_thread_info); @@ -172,7 +169,7 @@ static void flatten_surface_closure_tree(ShaderData *sd, } } -void OSLShader::eval_surface(KernelGlobals *kg, ShaderData *sd, PathState *state, int path_flag, ShaderContext ctx) +void OSLShader::eval_surface(KernelGlobals *kg, ShaderData *sd, PathState *state, int path_flag) { /* setup shader globals from shader data */ OSLThreadData *tdata = kg->osl_tdata; @@ -181,7 +178,7 @@ void OSLShader::eval_surface(KernelGlobals *kg, ShaderData *sd, PathState *state /* execute shader for this point */ OSL::ShadingSystem *ss = (OSL::ShadingSystem*)kg->osl_ss; OSL::ShaderGlobals *globals = &tdata->globals; - OSL::ShadingContext *octx = tdata->context[(int)ctx]; + OSL::ShadingContext *octx = tdata->context; int shader = sd->shader & SHADER_MASK; /* automatic bump shader */ @@ -273,7 +270,7 @@ static void flatten_background_closure_tree(ShaderData *sd, } } -void OSLShader::eval_background(KernelGlobals *kg, ShaderData *sd, PathState *state, int path_flag, ShaderContext ctx) +void OSLShader::eval_background(KernelGlobals *kg, ShaderData *sd, PathState *state, int path_flag) { /* setup shader globals from shader data */ OSLThreadData *tdata = kg->osl_tdata; @@ -282,7 +279,7 @@ void OSLShader::eval_background(KernelGlobals *kg, ShaderData *sd, PathState *st /* execute shader for this point 
*/ OSL::ShadingSystem *ss = (OSL::ShadingSystem*)kg->osl_ss; OSL::ShaderGlobals *globals = &tdata->globals; - OSL::ShadingContext *octx = tdata->context[(int)ctx]; + OSL::ShadingContext *octx = tdata->context; if(kg->osl->background_state) { ss->execute(octx, *(kg->osl->background_state), *globals); @@ -328,7 +325,7 @@ static void flatten_volume_closure_tree(ShaderData *sd, } } -void OSLShader::eval_volume(KernelGlobals *kg, ShaderData *sd, PathState *state, int path_flag, ShaderContext ctx) +void OSLShader::eval_volume(KernelGlobals *kg, ShaderData *sd, PathState *state, int path_flag) { /* setup shader globals from shader data */ OSLThreadData *tdata = kg->osl_tdata; @@ -337,7 +334,7 @@ void OSLShader::eval_volume(KernelGlobals *kg, ShaderData *sd, PathState *state, /* execute shader */ OSL::ShadingSystem *ss = (OSL::ShadingSystem*)kg->osl_ss; OSL::ShaderGlobals *globals = &tdata->globals; - OSL::ShadingContext *octx = tdata->context[(int)ctx]; + OSL::ShadingContext *octx = tdata->context; int shader = sd->shader & SHADER_MASK; if(kg->osl->volume_state[shader]) { @@ -351,19 +348,17 @@ void OSLShader::eval_volume(KernelGlobals *kg, ShaderData *sd, PathState *state, /* Displacement */ -void OSLShader::eval_displacement(KernelGlobals *kg, ShaderData *sd, ShaderContext ctx) +void OSLShader::eval_displacement(KernelGlobals *kg, ShaderData *sd, PathState *state) { /* setup shader globals from shader data */ OSLThreadData *tdata = kg->osl_tdata; - PathState state = {0}; - - shaderdata_to_shaderglobals(kg, sd, &state, 0, tdata); + shaderdata_to_shaderglobals(kg, sd, state, 0, tdata); /* execute shader */ OSL::ShadingSystem *ss = (OSL::ShadingSystem*)kg->osl_ss; OSL::ShaderGlobals *globals = &tdata->globals; - OSL::ShadingContext *octx = tdata->context[(int)ctx]; + OSL::ShadingContext *octx = tdata->context; int shader = sd->shader & SHADER_MASK; if(kg->osl->displacement_state[shader]) { diff --git a/intern/cycles/kernel/osl/osl_shader.h 
b/intern/cycles/kernel/osl/osl_shader.h index ad06dd6929d..6b392b25cf7 100644 --- a/intern/cycles/kernel/osl/osl_shader.h +++ b/intern/cycles/kernel/osl/osl_shader.h @@ -29,7 +29,7 @@ * This means no thread state must be passed along in the kernel itself. */ -#include "kernel_types.h" +#include "kernel/kernel_types.h" CCL_NAMESPACE_BEGIN @@ -53,10 +53,10 @@ public: static void thread_free(KernelGlobals *kg); /* eval */ - static void eval_surface(KernelGlobals *kg, ShaderData *sd, PathState *state, int path_flag, ShaderContext ctx); - static void eval_background(KernelGlobals *kg, ShaderData *sd, PathState *state, int path_flag, ShaderContext ctx); - static void eval_volume(KernelGlobals *kg, ShaderData *sd, PathState *state, int path_flag, ShaderContext ctx); - static void eval_displacement(KernelGlobals *kg, ShaderData *sd, ShaderContext ctx); + static void eval_surface(KernelGlobals *kg, ShaderData *sd, PathState *state, int path_flag); + static void eval_background(KernelGlobals *kg, ShaderData *sd, PathState *state, int path_flag); + static void eval_volume(KernelGlobals *kg, ShaderData *sd, PathState *state, int path_flag); + static void eval_displacement(KernelGlobals *kg, ShaderData *sd, PathState *state); /* attributes */ static int find_attribute(KernelGlobals *kg, const ShaderData *sd, uint id, AttributeDescriptor *desc); diff --git a/intern/cycles/kernel/shaders/CMakeLists.txt b/intern/cycles/kernel/shaders/CMakeLists.txt index b43f8402d42..1a8ed4c884a 100644 --- a/intern/cycles/kernel/shaders/CMakeLists.txt +++ b/intern/cycles/kernel/shaders/CMakeLists.txt @@ -81,13 +81,15 @@ set(SRC_OSL node_wireframe.osl node_hair_bsdf.osl node_uv_map.osl + node_principled_bsdf.osl node_rgb_to_bw.osl ) set(SRC_OSL_HEADERS - node_texture.h node_color.h node_fresnel.h + node_ramp_util.h + node_texture.h stdosl.h oslutil.h ) diff --git a/intern/cycles/kernel/shaders/node_light_path.osl b/intern/cycles/kernel/shaders/node_light_path.osl index a021a40467d..64fe4c20132 
100644 --- a/intern/cycles/kernel/shaders/node_light_path.osl +++ b/intern/cycles/kernel/shaders/node_light_path.osl @@ -27,6 +27,8 @@ shader node_light_path( output float IsVolumeScatterRay = 0.0, output float RayLength = 0.0, output float RayDepth = 0.0, + output float DiffuseDepth = 0.0, + output float GlossyDepth = 0.0, output float TransparentDepth = 0.0, output float TransmissionDepth = 0.0) { @@ -45,6 +47,14 @@ shader node_light_path( getattribute("path:ray_depth", ray_depth); RayDepth = (float)ray_depth; + int diffuse_depth; + getattribute("path:diffuse_depth", diffuse_depth); + DiffuseDepth = (float)diffuse_depth; + + int glossy_depth; + getattribute("path:glossy_depth", glossy_depth); + GlossyDepth = (float)glossy_depth; + int transparent_depth; getattribute("path:transparent_depth", transparent_depth); TransparentDepth = (float)transparent_depth; diff --git a/intern/cycles/kernel/shaders/node_principled_bsdf.osl b/intern/cycles/kernel/shaders/node_principled_bsdf.osl new file mode 100644 index 00000000000..6870d479af3 --- /dev/null +++ b/intern/cycles/kernel/shaders/node_principled_bsdf.osl @@ -0,0 +1,120 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "stdosl.h" +#include "node_fresnel.h" + +shader node_principled_bsdf( + string distribution = "Multiscatter GGX", + color BaseColor = color(0.8, 0.8, 0.8), + float Subsurface = 0.0, + vector SubsurfaceRadius = vector(1.0, 1.0, 1.0), + color SubsurfaceColor = color(0.7, 0.1, 0.1), + float Metallic = 0.0, + float Specular = 0.5, + float SpecularTint = 0.0, + float Roughness = 0.5, + float Anisotropic = 0.0, + float AnisotropicRotation = 0.0, + float Sheen = 0.0, + float SheenTint = 0.5, + float Clearcoat = 0.0, + float ClearcoatRoughness = 0.03, + float IOR = 1.45, + float Transmission = 0.0, + float TransmissionRoughness = 0.0, + normal Normal = N, + normal ClearcoatNormal = N, + normal Tangent = normalize(dPdu), + output closure color BSDF = 0) +{ + float f = max(IOR, 1e-5); + float diffuse_weight = (1.0 - clamp(Metallic, 0.0, 1.0)) * (1.0 - clamp(Transmission, 0.0, 1.0)); + float final_transmission = clamp(Transmission, 0.0, 1.0) * (1.0 - clamp(Metallic, 0.0, 1.0)); + float specular_weight = (1.0 - final_transmission); + + vector T = Tangent; + + float m_cdlum = luminance(BaseColor); + color m_ctint = m_cdlum > 0.0 ? BaseColor / m_cdlum : color(0.0, 0.0, 0.0); // normalize lum. 
to isolate hue+sat + + /* rotate tangent */ + if (AnisotropicRotation != 0.0) + T = rotate(T, AnisotropicRotation * M_2PI, point(0.0, 0.0, 0.0), Normal); + + if (diffuse_weight > 1e-5) { + if (Subsurface > 1e-5) { + color mixed_ss_base_color = SubsurfaceColor * Subsurface + BaseColor * (1.0 - Subsurface); + BSDF = mixed_ss_base_color * bssrdf_principled(Normal, Subsurface * SubsurfaceRadius, 0.0, SubsurfaceColor, Roughness); + } else { + BSDF = BaseColor * principled_diffuse(Normal, Roughness); + } + + if (Sheen > 1e-5) { + color sheen_color = color(1.0, 1.0, 1.0) * (1.0 - SheenTint) + m_ctint * SheenTint; + + BSDF = BSDF + sheen_color * Sheen * principled_sheen(Normal); + } + + BSDF = BSDF * diffuse_weight; + } + + if (specular_weight > 1e-5) { + float aspect = sqrt(1.0 - Anisotropic * 0.9); + float r2 = Roughness * Roughness; + + float alpha_x = r2 / aspect; + float alpha_y = r2 * aspect; + + color tmp_col = color(1.0, 1.0, 1.0) * (1.0 - SpecularTint) + m_ctint * SpecularTint; + + color Cspec0 = (Specular * 0.08 * tmp_col) * (1.0 - Metallic) + BaseColor * Metallic; + + if (distribution == "GGX" || Roughness <= 0.075) { + BSDF = BSDF + specular_weight * microfacet_ggx_aniso_fresnel(Normal, T, alpha_x, alpha_y, (2.0 / (1.0 - sqrt(0.08 * Specular))) - 1.0, BaseColor, Cspec0); + } else { + BSDF = BSDF + specular_weight * microfacet_multi_ggx_aniso_fresnel(Normal, T, alpha_x, alpha_y, (2.0 / (1.0 - sqrt(0.08 * Specular))) - 1.0, BaseColor, Cspec0); + } + } + + if (final_transmission > 1e-5) { + color Cspec0 = BaseColor * SpecularTint + color(1.0, 1.0, 1.0) * (1.0 - SpecularTint); + float eta = backfacing() ? 
1.0 / f : f; + + if (distribution == "GGX" || Roughness <= 5e-2) { + float cosNO = dot(Normal, I); + float Fr = fresnel_dielectric_cos(cosNO, eta); + + float refl_roughness = Roughness; + if (Roughness <= 1e-2) + refl_roughness = 0.0; + + float transmission_roughness = refl_roughness; + if (distribution == "GGX") + transmission_roughness = 1.0 - (1.0 - refl_roughness) * (1.0 - TransmissionRoughness); + + BSDF = BSDF + final_transmission * (Fr * microfacet_ggx_fresnel(Normal, refl_roughness * refl_roughness, eta, BaseColor, Cspec0) + + (1.0 - Fr) * BaseColor * microfacet_ggx_refraction(Normal, transmission_roughness * transmission_roughness, eta)); + } else { + BSDF = BSDF + final_transmission * microfacet_multi_ggx_glass_fresnel(Normal, Roughness * Roughness, eta, BaseColor, Cspec0); + } + } + + if (Clearcoat > 1e-5) { + BSDF = BSDF + principled_clearcoat(ClearcoatNormal, Clearcoat, ClearcoatRoughness * ClearcoatRoughness); + } +} + diff --git a/intern/cycles/kernel/shaders/stdosl.h b/intern/cycles/kernel/shaders/stdosl.h index a8dda8a12c9..c91d2918687 100644 --- a/intern/cycles/kernel/shaders/stdosl.h +++ b/intern/cycles/kernel/shaders/stdosl.h @@ -530,6 +530,11 @@ closure color microfacet_ggx_refraction(normal N, float ag, float eta) BUILTIN; closure color microfacet_multi_ggx(normal N, float ag, color C) BUILTIN; closure color microfacet_multi_ggx_aniso(normal N, vector T, float ax, float ay, color C) BUILTIN; closure color microfacet_multi_ggx_glass(normal N, float ag, float eta, color C) BUILTIN; +closure color microfacet_ggx_fresnel(normal N, float ag, float eta, color C, color Cspec0) BUILTIN; +closure color microfacet_ggx_aniso_fresnel(normal N, vector T, float ax, float ay, float eta, color C, color Cspec0) BUILTIN; +closure color microfacet_multi_ggx_fresnel(normal N, float ag, float eta, color C, color Cspec0) BUILTIN; +closure color microfacet_multi_ggx_aniso_fresnel(normal N, vector T, float ax, float ay, float eta, color C, color Cspec0) BUILTIN; 
+closure color microfacet_multi_ggx_glass_fresnel(normal N, float ag, float eta, color C, color Cspec0) BUILTIN; closure color microfacet_beckmann(normal N, float ab) BUILTIN; closure color microfacet_beckmann_aniso(normal N, vector T, float ax, float ay) BUILTIN; closure color microfacet_beckmann_refraction(normal N, float ab, float eta) BUILTIN; @@ -539,11 +544,15 @@ closure color emission() BUILTIN; closure color background() BUILTIN; closure color holdout() BUILTIN; closure color ambient_occlusion() BUILTIN; +closure color principled_diffuse(normal N, float roughness) BUILTIN; +closure color principled_sheen(normal N) BUILTIN; +closure color principled_clearcoat(normal N, float clearcoat, float clearcoat_roughness) BUILTIN; // BSSRDF closure color bssrdf_cubic(normal N, vector radius, float texture_blur, float sharpness) BUILTIN; closure color bssrdf_gaussian(normal N, vector radius, float texture_blur) BUILTIN; closure color bssrdf_burley(normal N, vector radius, float texture_blur, color albedo) BUILTIN; +closure color bssrdf_principled(normal N, vector radius, float texture_blur, color subsurface_color, float roughness) BUILTIN; // Hair closure color hair_reflection(normal N, float roughnessu, float roughnessv, vector T, float offset) BUILTIN; diff --git a/intern/cycles/kernel/split/kernel_background_buffer_update.h b/intern/cycles/kernel/split/kernel_background_buffer_update.h deleted file mode 100644 index 9bfa71c75ef..00000000000 --- a/intern/cycles/kernel/split/kernel_background_buffer_update.h +++ /dev/null @@ -1,248 +0,0 @@ -/* - * Copyright 2011-2015 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "kernel_split_common.h" - -/* Note on kernel_background_buffer_update kernel. - * This is the fourth kernel in the ray tracing logic, and the third - * of the path iteration kernels. This kernel takes care of rays that hit - * the background (sceneintersect kernel), and for the rays of - * state RAY_UPDATE_BUFFER it updates the ray's accumulated radiance in - * the output buffer. This kernel also takes care of rays that have been determined - * to-be-regenerated. - * - * We will empty QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue in this kernel - * - * Typically all rays that are in state RAY_HIT_BACKGROUND, RAY_UPDATE_BUFFER - * will be eventually set to RAY_TO_REGENERATE state in this kernel. Finally all rays of ray_state - * RAY_TO_REGENERATE will be regenerated and put in queue QUEUE_ACTIVE_AND_REGENERATED_RAYS. 
- * - * The input and output are as follows, - * - * rng_coop ---------------------------------------------|--- kernel_background_buffer_update --|--- PathRadiance_coop - * throughput_coop --------------------------------------| |--- L_transparent_coop - * per_sample_output_buffers ----------------------------| |--- per_sample_output_buffers - * Ray_coop ---------------------------------------------| |--- ray_state - * PathState_coop ---------------------------------------| |--- Queue_data (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS) - * L_transparent_coop -----------------------------------| |--- Queue_data (QUEUE_ACTIVE_AND_REGENERATED_RAYS) - * ray_state --------------------------------------------| |--- Queue_index (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS) - * Queue_data (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS) ----| |--- Queue_index (QUEUE_ACTIVE_AND_REGENERATED_RAYS) - * Queue_index (QUEUE_ACTIVE_AND_REGENERATED_RAYS) ------| |--- work_array - * parallel_samples -------------------------------------| |--- PathState_coop - * end_sample -------------------------------------------| |--- throughput_coop - * kg (globals) -----------------------------------------| |--- rng_coop - * rng_state --------------------------------------------| |--- Ray - * PathRadiance_coop ------------------------------------| | - * sw ---------------------------------------------------| | - * sh ---------------------------------------------------| | - * sx ---------------------------------------------------| | - * sy ---------------------------------------------------| | - * stride -----------------------------------------------| | - * work_array -------------------------------------------| |--- work_array - * queuesize --------------------------------------------| | - * start_sample -----------------------------------------| |--- work_pool_wgs - * work_pool_wgs ----------------------------------------| | - * num_samples ------------------------------------------| | - * - * note on sd : sd 
argument is neither an input nor an output for this kernel. It is just filled and consumed here itself. - * Note on Queues : - * This kernel fetches rays from QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue. - * - * State of queues when this kernel is called : - * At entry, - * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE rays - * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_UPDATE_BUFFER, RAY_HIT_BACKGROUND, RAY_TO_REGENERATE rays - * At exit, - * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE and RAY_REGENERATED rays - * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be empty - */ -ccl_device char kernel_background_buffer_update( - KernelGlobals *kg, - ccl_global float *per_sample_output_buffers, - ccl_global uint *rng_state, - ccl_global uint *rng_coop, /* Required for buffer Update */ - ccl_global float3 *throughput_coop, /* Required for background hit processing */ - PathRadiance *PathRadiance_coop, /* Required for background hit processing and buffer Update */ - ccl_global Ray *Ray_coop, /* Required for background hit processing */ - ccl_global PathState *PathState_coop, /* Required for background hit processing */ - ccl_global float *L_transparent_coop, /* Required for background hit processing and buffer Update */ - ccl_global char *ray_state, /* Stores information on the current state of a ray */ - int sw, int sh, int sx, int sy, int stride, - int rng_state_offset_x, - int rng_state_offset_y, - int rng_state_stride, - ccl_global unsigned int *work_array, /* Denotes work of each ray */ - int end_sample, - int start_sample, -#ifdef __WORK_STEALING__ - ccl_global unsigned int *work_pool_wgs, - unsigned int num_samples, -#endif -#ifdef __KERNEL_DEBUG__ - DebugData *debugdata_coop, -#endif - int parallel_samples, /* Number of samples to be processed in parallel */ - int ray_index) -{ - char enqueue_flag = 0; -#ifdef __KERNEL_DEBUG__ - DebugData *debug_data = &debugdata_coop[ray_index]; -#endif - ccl_global PathState 
*state = &PathState_coop[ray_index]; - PathRadiance *L = L = &PathRadiance_coop[ray_index]; - ccl_global Ray *ray = &Ray_coop[ray_index]; - ccl_global float3 *throughput = &throughput_coop[ray_index]; - ccl_global float *L_transparent = &L_transparent_coop[ray_index]; - ccl_global uint *rng = &rng_coop[ray_index]; - -#ifdef __WORK_STEALING__ - unsigned int my_work; - ccl_global float *initial_per_sample_output_buffers; - ccl_global uint *initial_rng; -#endif - unsigned int sample; - unsigned int tile_x; - unsigned int tile_y; - unsigned int pixel_x; - unsigned int pixel_y; - unsigned int my_sample_tile; - -#ifdef __WORK_STEALING__ - my_work = work_array[ray_index]; - sample = get_my_sample(my_work, sw, sh, parallel_samples, ray_index) + start_sample; - get_pixel_tile_position(&pixel_x, &pixel_y, - &tile_x, &tile_y, - my_work, - sw, sh, sx, sy, - parallel_samples, - ray_index); - my_sample_tile = 0; - initial_per_sample_output_buffers = per_sample_output_buffers; - initial_rng = rng_state; -#else /* __WORK_STEALING__ */ - sample = work_array[ray_index]; - int tile_index = ray_index / parallel_samples; - /* buffer and rng_state's stride is "stride". 
Find x and y using ray_index */ - tile_x = tile_index % sw; - tile_y = tile_index / sw; - my_sample_tile = ray_index - (tile_index * parallel_samples); -#endif /* __WORK_STEALING__ */ - - rng_state += (rng_state_offset_x + tile_x) + (rng_state_offset_y + tile_y) * rng_state_stride; - per_sample_output_buffers += (((tile_x + (tile_y * stride)) * parallel_samples) + my_sample_tile) * kernel_data.film.pass_stride; - - if(IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND)) { - /* eval background shader if nothing hit */ - if(kernel_data.background.transparent && (state->flag & PATH_RAY_CAMERA)) { - *L_transparent = (*L_transparent) + average((*throughput)); -#ifdef __PASSES__ - if(!(kernel_data.film.pass_flag & PASS_BACKGROUND)) -#endif - ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER); - } - - if(IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND)) { -#ifdef __BACKGROUND__ - /* sample background shader */ - float3 L_background = indirect_background(kg, kg->sd_input, state, ray); - path_radiance_accum_background(L, (*throughput), L_background, state->bounce); -#endif - ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER); - } - } - - if(IS_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER)) { - float3 L_sum = path_radiance_clamp_and_sum(kg, L); - kernel_write_light_passes(kg, per_sample_output_buffers, L, sample); -#ifdef __KERNEL_DEBUG__ - kernel_write_debug_passes(kg, per_sample_output_buffers, state, debug_data, sample); -#endif - float4 L_rad = make_float4(L_sum.x, L_sum.y, L_sum.z, 1.0f - (*L_transparent)); - - /* accumulate result in output buffer */ - kernel_write_pass_float4(per_sample_output_buffers, sample, L_rad); - path_rng_end(kg, rng_state, *rng); - - ASSIGN_RAY_STATE(ray_state, ray_index, RAY_TO_REGENERATE); - } - - if(IS_STATE(ray_state, ray_index, RAY_TO_REGENERATE)) { -#ifdef __WORK_STEALING__ - /* We have completed current work; So get next work */ - int valid_work = get_next_work(work_pool_wgs, &my_work, sw, sh, num_samples, 
parallel_samples, ray_index); - if(!valid_work) { - /* If work is invalid, this means no more work is available and the thread may exit */ - ASSIGN_RAY_STATE(ray_state, ray_index, RAY_INACTIVE); - } -#else /* __WORK_STEALING__ */ - if((sample + parallel_samples) >= end_sample) { - ASSIGN_RAY_STATE(ray_state, ray_index, RAY_INACTIVE); - } -#endif /* __WORK_STEALING__ */ - - if(IS_STATE(ray_state, ray_index, RAY_TO_REGENERATE)) { -#ifdef __WORK_STEALING__ - work_array[ray_index] = my_work; - /* Get the sample associated with the current work */ - sample = get_my_sample(my_work, sw, sh, parallel_samples, ray_index) + start_sample; - /* Get pixel and tile position associated with current work */ - get_pixel_tile_position(&pixel_x, &pixel_y, &tile_x, &tile_y, my_work, sw, sh, sx, sy, parallel_samples, ray_index); - my_sample_tile = 0; - - /* Remap rng_state according to the current work */ - rng_state = initial_rng + ((rng_state_offset_x + tile_x) + (rng_state_offset_y + tile_y) * rng_state_stride); - /* Remap per_sample_output_buffers according to the current work */ - per_sample_output_buffers = initial_per_sample_output_buffers - + (((tile_x + (tile_y * stride)) * parallel_samples) + my_sample_tile) * kernel_data.film.pass_stride; -#else /* __WORK_STEALING__ */ - work_array[ray_index] = sample + parallel_samples; - sample = work_array[ray_index]; - - /* Get ray position from ray index */ - pixel_x = sx + ((ray_index / parallel_samples) % sw); - pixel_y = sy + ((ray_index / parallel_samples) / sw); -#endif /* __WORK_STEALING__ */ - - /* Initialize random numbers and ray. */ - kernel_path_trace_setup(kg, rng_state, sample, pixel_x, pixel_y, rng, ray); - - if(ray->t != 0.0f) { - /* Initialize throughput, L_transparent, Ray, PathState; - * These rays proceed with path-iteration. 
- */ - *throughput = make_float3(1.0f, 1.0f, 1.0f); - *L_transparent = 0.0f; - path_radiance_init(L, kernel_data.film.use_light_pass); - path_state_init(kg, kg->sd_input, state, rng, sample, ray); -#ifdef __KERNEL_DEBUG__ - debug_data_init(debug_data); -#endif - ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED); - enqueue_flag = 1; - } - else { - /* These rays do not participate in path-iteration. */ - float4 L_rad = make_float4(0.0f, 0.0f, 0.0f, 0.0f); - /* Accumulate result in output buffer. */ - kernel_write_pass_float4(per_sample_output_buffers, sample, L_rad); - path_rng_end(kg, rng_state, *rng); - - ASSIGN_RAY_STATE(ray_state, ray_index, RAY_TO_REGENERATE); - } - } - } - return enqueue_flag; -} diff --git a/intern/cycles/kernel/split/kernel_branched.h b/intern/cycles/kernel/split/kernel_branched.h new file mode 100644 index 00000000000..2313feac089 --- /dev/null +++ b/intern/cycles/kernel/split/kernel_branched.h @@ -0,0 +1,219 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +CCL_NAMESPACE_BEGIN + +#ifdef __BRANCHED_PATH__ + +/* sets up the various state needed to do an indirect loop */ +ccl_device_inline void kernel_split_branched_path_indirect_loop_init(KernelGlobals *kg, int ray_index) +{ + SplitBranchedState *branched_state = &kernel_split_state.branched_state[ray_index]; + + /* save a copy of the state to restore later */ +#define BRANCHED_STORE(name) \ + branched_state->name = kernel_split_state.name[ray_index]; + + BRANCHED_STORE(path_state); + BRANCHED_STORE(throughput); + BRANCHED_STORE(ray); + BRANCHED_STORE(sd); + BRANCHED_STORE(isect); + BRANCHED_STORE(ray_state); + +#undef BRANCHED_STORE + + /* set loop counters to intial position */ + branched_state->next_closure = 0; + branched_state->next_sample = 0; +} + +/* ends an indirect loop and restores the previous state */ +ccl_device_inline void kernel_split_branched_path_indirect_loop_end(KernelGlobals *kg, int ray_index) +{ + SplitBranchedState *branched_state = &kernel_split_state.branched_state[ray_index]; + + /* restore state */ +#define BRANCHED_RESTORE(name) \ + kernel_split_state.name[ray_index] = branched_state->name; + + BRANCHED_RESTORE(path_state); + BRANCHED_RESTORE(throughput); + BRANCHED_RESTORE(ray); + BRANCHED_RESTORE(sd); + BRANCHED_RESTORE(isect); + BRANCHED_RESTORE(ray_state); + +#undef BRANCHED_RESTORE + + /* leave indirect loop */ + REMOVE_RAY_FLAG(kernel_split_state.ray_state, ray_index, RAY_BRANCHED_INDIRECT); +} + +ccl_device_inline bool kernel_split_branched_indirect_start_shared(KernelGlobals *kg, int ray_index) +{ + ccl_global char *ray_state = kernel_split_state.ray_state; + + int inactive_ray = dequeue_ray_index(QUEUE_INACTIVE_RAYS, + kernel_split_state.queue_data, kernel_split_params.queue_size, kernel_split_params.queue_index); + + if(!IS_STATE(ray_state, inactive_ray, RAY_INACTIVE)) { + return false; + } + +#define SPLIT_DATA_ENTRY(type, name, num) \ + kernel_split_state.name[inactive_ray] = kernel_split_state.name[ray_index]; + 
SPLIT_DATA_ENTRIES_BRANCHED_SHARED +#undef SPLIT_DATA_ENTRY + + kernel_split_state.branched_state[inactive_ray].shared_sample_count = 0; + kernel_split_state.branched_state[inactive_ray].original_ray = ray_index; + kernel_split_state.branched_state[inactive_ray].waiting_on_shared_samples = false; + + PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; + PathRadiance *inactive_L = &kernel_split_state.path_radiance[inactive_ray]; + + path_radiance_init(inactive_L, kernel_data.film.use_light_pass); + path_radiance_copy_indirect(inactive_L, L); + + ray_state[inactive_ray] = RAY_REGENERATED; + ADD_RAY_FLAG(ray_state, inactive_ray, RAY_BRANCHED_INDIRECT_SHARED); + ADD_RAY_FLAG(ray_state, inactive_ray, IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT)); + + atomic_fetch_and_inc_uint32((ccl_global uint*)&kernel_split_state.branched_state[ray_index].shared_sample_count); + + return true; +} + +/* bounce off surface and integrate indirect light */ +ccl_device_noinline bool kernel_split_branched_path_surface_indirect_light_iter(KernelGlobals *kg, + int ray_index, + float num_samples_adjust, + ShaderData *saved_sd, + bool reset_path_state, + bool wait_for_shared) +{ + SplitBranchedState *branched_state = &kernel_split_state.branched_state[ray_index]; + + ShaderData *sd = saved_sd; + PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; + float3 throughput = branched_state->throughput; + ccl_global PathState *ps = &kernel_split_state.path_state[ray_index]; + + float sum_sample_weight = 0.0f; +#ifdef __DENOISING_FEATURES__ + if(ps->denoising_feature_weight > 0.0f) { + for(int i = 0; i < sd->num_closure; i++) { + const ShaderClosure *sc = &sd->closure[i]; + + /* transparency is not handled here, but in outer loop */ + if(!CLOSURE_IS_BSDF(sc->type) || CLOSURE_IS_BSDF_TRANSPARENT(sc->type)) { + continue; + } + + sum_sample_weight += sc->sample_weight; + } + } + else { + sum_sample_weight = 1.0f; + } +#endif /* __DENOISING_FEATURES__ */ + + for(int i = 
branched_state->next_closure; i < sd->num_closure; i++) { + const ShaderClosure *sc = &sd->closure[i]; + + if(!CLOSURE_IS_BSDF(sc->type)) + continue; + /* transparency is not handled here, but in outer loop */ + if(sc->type == CLOSURE_BSDF_TRANSPARENT_ID) + continue; + + int num_samples; + + if(CLOSURE_IS_BSDF_DIFFUSE(sc->type)) + num_samples = kernel_data.integrator.diffuse_samples; + else if(CLOSURE_IS_BSDF_BSSRDF(sc->type)) + num_samples = 1; + else if(CLOSURE_IS_BSDF_GLOSSY(sc->type)) + num_samples = kernel_data.integrator.glossy_samples; + else + num_samples = kernel_data.integrator.transmission_samples; + + num_samples = ceil_to_int(num_samples_adjust*num_samples); + + float num_samples_inv = num_samples_adjust/num_samples; + + for(int j = branched_state->next_sample; j < num_samples; j++) { + if(reset_path_state) { + *ps = branched_state->path_state; + } + + ps->rng_hash = cmj_hash(branched_state->path_state.rng_hash, i); + + ccl_global float3 *tp = &kernel_split_state.throughput[ray_index]; + *tp = throughput; + + ccl_global Ray *bsdf_ray = &kernel_split_state.ray[ray_index]; + + if(!kernel_branched_path_surface_bounce(kg, + sd, + sc, + j, + num_samples, + tp, + ps, + &L->state, + bsdf_ray, + sum_sample_weight)) + { + continue; + } + + ps->rng_hash = branched_state->path_state.rng_hash; + + /* update state for next iteration */ + branched_state->next_closure = i; + branched_state->next_sample = j+1; + + /* start the indirect path */ + *tp *= num_samples_inv; + + if(kernel_split_branched_indirect_start_shared(kg, ray_index)) { + continue; + } + + return true; + } + + branched_state->next_sample = 0; + } + + branched_state->next_closure = sd->num_closure; + + if(wait_for_shared) { + branched_state->waiting_on_shared_samples = (branched_state->shared_sample_count > 0); + if(branched_state->waiting_on_shared_samples) { + return true; + } + } + + return false; +} + +#endif /* __BRANCHED_PATH__ */ + +CCL_NAMESPACE_END + diff --git 
a/intern/cycles/kernel/split/kernel_buffer_update.h b/intern/cycles/kernel/split/kernel_buffer_update.h new file mode 100644 index 00000000000..511334e0550 --- /dev/null +++ b/intern/cycles/kernel/split/kernel_buffer_update.h @@ -0,0 +1,154 @@ +/* + * Copyright 2011-2015 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +CCL_NAMESPACE_BEGIN + +/* This kernel takes care of rays that hit the background (sceneintersect + * kernel), and for the rays of state RAY_UPDATE_BUFFER it updates the ray's + * accumulated radiance in the output buffer. This kernel also takes care of + * rays that have been determined to-be-regenerated. + * + * We will empty QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue in this kernel. + * + * Typically all rays that are in state RAY_HIT_BACKGROUND, RAY_UPDATE_BUFFER + * will be eventually set to RAY_TO_REGENERATE state in this kernel. + * Finally all rays of ray_state RAY_TO_REGENERATE will be regenerated and put + * in queue QUEUE_ACTIVE_AND_REGENERATED_RAYS. + * + * State of queues when this kernel is called: + * At entry, + * - QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE rays. + * - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with + * RAY_UPDATE_BUFFER, RAY_HIT_BACKGROUND, RAY_TO_REGENERATE rays. + * At exit, + * - QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE and + * RAY_REGENERATED rays. + * - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be empty. 
+ */ +ccl_device void kernel_buffer_update(KernelGlobals *kg, + ccl_local_param unsigned int *local_queue_atomics) +{ + if(ccl_local_id(0) == 0 && ccl_local_id(1) == 0) { + *local_queue_atomics = 0; + } + ccl_barrier(CCL_LOCAL_MEM_FENCE); + + int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); + if(ray_index == 0) { + /* We will empty this queue in this kernel. */ + kernel_split_params.queue_index[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS] = 0; + } + char enqueue_flag = 0; + ray_index = get_ray_index(kg, ray_index, + QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS, + kernel_split_state.queue_data, + kernel_split_params.queue_size, + 1); + +#ifdef __COMPUTE_DEVICE_GPU__ + /* If we are executing on a GPU device, we exit all threads that are not + * required. + * + * If we are executing on a CPU device, then we need to keep all threads + * active since we have barrier() calls later in the kernel. CPU devices, + * expect all threads to execute barrier statement. + */ + if(ray_index == QUEUE_EMPTY_SLOT) { + return; + } +#endif + +#ifndef __COMPUTE_DEVICE_GPU__ + if(ray_index != QUEUE_EMPTY_SLOT) { +#endif + + ccl_global char *ray_state = kernel_split_state.ray_state; + ccl_global PathState *state = &kernel_split_state.path_state[ray_index]; + PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; + ccl_global Ray *ray = &kernel_split_state.ray[ray_index]; + ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index]; + + if(IS_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER)) { + uint sample = state->sample; + uint buffer_offset = kernel_split_state.buffer_offset[ray_index]; + ccl_global float *buffer = kernel_split_params.tile.buffer + buffer_offset; + + /* accumulate result in output buffer */ + kernel_write_result(kg, buffer, sample, L); + + ASSIGN_RAY_STATE(ray_state, ray_index, RAY_TO_REGENERATE); + } + + if(IS_STATE(ray_state, ray_index, RAY_TO_REGENERATE)) { + /* We have completed current work; So get next work */ + ccl_global uint 
*work_pools = kernel_split_params.work_pools; + uint total_work_size = kernel_split_params.total_work_size; + uint work_index; + + if(!get_next_work(kg, work_pools, total_work_size, ray_index, &work_index)) { + /* If work is invalid, this means no more work is available and the thread may exit */ + ASSIGN_RAY_STATE(ray_state, ray_index, RAY_INACTIVE); + } + + if(IS_STATE(ray_state, ray_index, RAY_TO_REGENERATE)) { + ccl_global WorkTile *tile = &kernel_split_params.tile; + uint x, y, sample; + get_work_pixel(tile, work_index, &x, &y, &sample); + + /* Store buffer offset for writing to passes. */ + uint buffer_offset = (tile->offset + x + y*tile->stride) * kernel_data.film.pass_stride; + kernel_split_state.buffer_offset[ray_index] = buffer_offset; + + /* Initialize random numbers and ray. */ + uint rng_hash; + kernel_path_trace_setup(kg, sample, x, y, &rng_hash, ray); + + if(ray->t != 0.0f) { + /* Initialize throughput, path radiance, Ray, PathState; + * These rays proceed with path-iteration. + */ + *throughput = make_float3(1.0f, 1.0f, 1.0f); + path_radiance_init(L, kernel_data.film.use_light_pass); + path_state_init(kg, &kernel_split_state.sd_DL_shadow[ray_index], state, rng_hash, sample, ray); +#ifdef __SUBSURFACE__ + kernel_path_subsurface_init_indirect(&kernel_split_state.ss_rays[ray_index]); +#endif + ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED); + enqueue_flag = 1; + } + else { + ASSIGN_RAY_STATE(ray_state, ray_index, RAY_TO_REGENERATE); + } + } + } + +#ifndef __COMPUTE_DEVICE_GPU__ + } +#endif + + /* Enqueue RAY_REGENERATED rays into QUEUE_ACTIVE_AND_REGENERATED_RAYS; + * These rays will be made active during next SceneIntersectkernel. 
+ */ + enqueue_ray_index_local(ray_index, + QUEUE_ACTIVE_AND_REGENERATED_RAYS, + enqueue_flag, + kernel_split_params.queue_size, + local_queue_atomics, + kernel_split_state.queue_data, + kernel_split_params.queue_index); +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/split/kernel_data_init.h b/intern/cycles/kernel/split/kernel_data_init.h index 6e158d53d23..77fb61b80a8 100644 --- a/intern/cycles/kernel/split/kernel_data_init.h +++ b/intern/cycles/kernel/split/kernel_data_init.h @@ -14,221 +14,96 @@ * limitations under the License. */ -#include "kernel_split_common.h" +CCL_NAMESPACE_BEGIN -/* Note on kernel_data_initialization kernel - * This kernel Initializes structures needed in path-iteration kernels. - * This is the first kernel in ray-tracing logic. +/* This kernel Initializes structures needed in path-iteration kernels. * - * Ray state of rays outside the tile-boundary will be marked RAY_INACTIVE - * - * Its input and output are as follows, - * - * Un-initialized rng---------------|--- kernel_data_initialization ---|--- Initialized rng - * Un-initialized throughput -------| |--- Initialized throughput - * Un-initialized L_transparent ----| |--- Initialized L_transparent - * Un-initialized PathRadiance -----| |--- Initialized PathRadiance - * Un-initialized Ray --------------| |--- Initialized Ray - * Un-initialized PathState --------| |--- Initialized PathState - * Un-initialized QueueData --------| |--- Initialized QueueData (to QUEUE_EMPTY_SLOT) - * Un-initialized QueueIndex -------| |--- Initialized QueueIndex (to 0) - * Un-initialized use_queues_flag---| |--- Initialized use_queues_flag (to false) - * Un-initialized ray_state --------| |--- Initialized ray_state - * parallel_samples --------------- | |--- Initialized per_sample_output_buffers - * rng_state -----------------------| |--- Initialized work_array - * data ----------------------------| |--- Initialized work_pool_wgs - * start_sample --------------------| | - * sx 
------------------------------| | - * sy ------------------------------| | - * sw ------------------------------| | - * sh ------------------------------| | - * stride --------------------------| | - * queuesize -----------------------| | - * num_samples ---------------------| | - * - * Note on Queues : + * Note on Queues: * All slots in queues are initialized to queue empty slot; * The number of elements in the queues is initialized to 0; */ + +#ifndef __KERNEL_CPU__ ccl_device void kernel_data_init( +#else +void KERNEL_FUNCTION_FULL_NAME(data_init)( +#endif KernelGlobals *kg, - ShaderData *sd_DL_shadow, ccl_constant KernelData *data, - ccl_global float *per_sample_output_buffers, - ccl_global uint *rng_state, - ccl_global uint *rng_coop, /* rng array to store rng values for all rays */ - ccl_global float3 *throughput_coop, /* throughput array to store throughput values for all rays */ - ccl_global float *L_transparent_coop, /* L_transparent array to store L_transparent values for all rays */ - PathRadiance *PathRadiance_coop, /* PathRadiance array to store PathRadiance values for all rays */ - ccl_global Ray *Ray_coop, /* Ray array to store Ray information for all rays */ - ccl_global PathState *PathState_coop, /* PathState array to store PathState information for all rays */ - Intersection *Intersection_coop_shadow, - ccl_global char *ray_state, /* Stores information on current state of a ray */ - -#define KERNEL_TEX(type, ttype, name) \ - ccl_global type *name, -#include "../kernel_textures.h" - - int start_sample, int sx, int sy, int sw, int sh, int offset, int stride, - int rng_state_offset_x, - int rng_state_offset_y, - int rng_state_stride, - ccl_global int *Queue_data, /* Memory for queues */ + ccl_global void *split_data_buffer, + int num_elements, + ccl_global char *ray_state, + +#ifdef __KERNEL_OPENCL__ + KERNEL_BUFFER_PARAMS, +#endif + + int start_sample, + int end_sample, + int sx, int sy, int sw, int sh, int offset, int stride, ccl_global int 
*Queue_index, /* Tracks the number of elements in queues */ int queuesize, /* size (capacity) of the queue */ ccl_global char *use_queues_flag, /* flag to decide if scene-intersect kernel should use queues to fetch ray index */ - ccl_global unsigned int *work_array, /* work array to store which work each ray belongs to */ -#ifdef __WORK_STEALING__ - ccl_global unsigned int *work_pool_wgs, /* Work pool for each work group */ - unsigned int num_samples, /* Total number of samples per pixel */ -#endif -#ifdef __KERNEL_DEBUG__ - DebugData *debugdata_coop, -#endif - int parallel_samples) /* Number of samples to be processed in parallel */ + ccl_global unsigned int *work_pools, /* Work pool for each work group */ + unsigned int num_samples, + ccl_global float *buffer) { +#ifdef KERNEL_STUB + STUB_ASSERT(KERNEL_ARCH, data_init); +#else + +#ifdef __KERNEL_OPENCL__ kg->data = data; - kg->sd_input = sd_DL_shadow; - kg->isect_shadow = Intersection_coop_shadow; -#define KERNEL_TEX(type, ttype, name) \ - kg->name = name; -#include "../kernel_textures.h" - - int thread_index = get_global_id(1) * get_global_size(0) + get_global_id(0); - -#ifdef __WORK_STEALING__ - int lid = get_local_id(1) * get_local_size(0) + get_local_id(0); - /* Initialize work_pool_wgs */ - if(lid == 0) { - int group_index = get_group_id(1) * get_num_groups(0) + get_group_id(0); - work_pool_wgs[group_index] = 0; - } - barrier(CLK_LOCAL_MEM_FENCE); -#endif /* __WORK_STEALING__ */ +#endif - /* Initialize queue data and queue index. */ - if(thread_index < queuesize) { - /* Initialize active ray queue. */ - Queue_data[QUEUE_ACTIVE_AND_REGENERATED_RAYS * queuesize + thread_index] = QUEUE_EMPTY_SLOT; - /* Initialize background and buffer update queue. */ - Queue_data[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS * queuesize + thread_index] = QUEUE_EMPTY_SLOT; - /* Initialize shadow ray cast of AO queue. 
*/ - Queue_data[QUEUE_SHADOW_RAY_CAST_AO_RAYS * queuesize + thread_index] = QUEUE_EMPTY_SLOT; - /* Initialize shadow ray cast of direct lighting queue. */ - Queue_data[QUEUE_SHADOW_RAY_CAST_DL_RAYS * queuesize + thread_index] = QUEUE_EMPTY_SLOT; - } + kernel_split_params.tile.x = sx; + kernel_split_params.tile.y = sy; + kernel_split_params.tile.w = sw; + kernel_split_params.tile.h = sh; - if(thread_index == 0) { - Queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS] = 0; - Queue_index[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS] = 0; - Queue_index[QUEUE_SHADOW_RAY_CAST_AO_RAYS] = 0; - Queue_index[QUEUE_SHADOW_RAY_CAST_DL_RAYS] = 0; - /* The scene-intersect kernel should not use the queues very first time. - * since the queue would be empty. - */ - use_queues_flag[0] = 0; - } + kernel_split_params.tile.start_sample = start_sample; + kernel_split_params.tile.num_samples = num_samples; - int x = get_global_id(0); - int y = get_global_id(1); + kernel_split_params.tile.offset = offset; + kernel_split_params.tile.stride = stride; - if(x < (sw * parallel_samples) && y < sh) { - int ray_index = x + y * (sw * parallel_samples); + kernel_split_params.tile.buffer = buffer; - /* This is the first assignment to ray_state; - * So we dont use ASSIGN_RAY_STATE macro. - */ - ray_state[ray_index] = RAY_ACTIVE; - - unsigned int my_sample; - unsigned int pixel_x; - unsigned int pixel_y; - unsigned int tile_x; - unsigned int tile_y; - unsigned int my_sample_tile; - -#ifdef __WORK_STEALING__ - unsigned int my_work = 0; - /* Get work. */ - get_next_work(work_pool_wgs, &my_work, sw, sh, num_samples, parallel_samples, ray_index); - /* Get the sample associated with the work. */ - my_sample = get_my_sample(my_work, sw, sh, parallel_samples, ray_index) + start_sample; - - my_sample_tile = 0; - - /* Get pixel and tile position associated with the work. 
*/ - get_pixel_tile_position(&pixel_x, &pixel_y, - &tile_x, &tile_y, - my_work, - sw, sh, sx, sy, - parallel_samples, - ray_index); - work_array[ray_index] = my_work; -#else /* __WORK_STEALING__ */ - unsigned int tile_index = ray_index / parallel_samples; - tile_x = tile_index % sw; - tile_y = tile_index / sw; - my_sample_tile = ray_index - (tile_index * parallel_samples); - my_sample = my_sample_tile + start_sample; - - /* Initialize work array. */ - work_array[ray_index] = my_sample ; - - /* Calculate pixel position of this ray. */ - pixel_x = sx + tile_x; - pixel_y = sy + tile_y; -#endif /* __WORK_STEALING__ */ - - rng_state += (rng_state_offset_x + tile_x) + (rng_state_offset_y + tile_y) * rng_state_stride; - - /* Initialise per_sample_output_buffers to all zeros. */ - per_sample_output_buffers += (((tile_x + (tile_y * stride)) * parallel_samples) + (my_sample_tile)) * kernel_data.film.pass_stride; - int per_sample_output_buffers_iterator = 0; - for(per_sample_output_buffers_iterator = 0; - per_sample_output_buffers_iterator < kernel_data.film.pass_stride; - per_sample_output_buffers_iterator++) - { - per_sample_output_buffers[per_sample_output_buffers_iterator] = 0.0f; - } + kernel_split_params.total_work_size = sw * sh * num_samples; + + kernel_split_params.work_pools = work_pools; - /* Initialize random numbers and ray. */ - kernel_path_trace_setup(kg, - rng_state, - my_sample, - pixel_x, pixel_y, - &rng_coop[ray_index], - &Ray_coop[ray_index]); - - if(Ray_coop[ray_index].t != 0.0f) { - /* Initialize throughput, L_transparent, Ray, PathState; - * These rays proceed with path-iteration. 
- */ - throughput_coop[ray_index] = make_float3(1.0f, 1.0f, 1.0f); - L_transparent_coop[ray_index] = 0.0f; - path_radiance_init(&PathRadiance_coop[ray_index], kernel_data.film.use_light_pass); - path_state_init(kg, - kg->sd_input, - &PathState_coop[ray_index], - &rng_coop[ray_index], - my_sample, - &Ray_coop[ray_index]); -#ifdef __KERNEL_DEBUG__ - debug_data_init(&debugdata_coop[ray_index]); + kernel_split_params.queue_index = Queue_index; + kernel_split_params.queue_size = queuesize; + kernel_split_params.use_queues_flag = use_queues_flag; + + split_data_init(kg, &kernel_split_state, num_elements, split_data_buffer, ray_state); + +#ifdef __KERNEL_OPENCL__ + kernel_set_buffer_pointers(kg, KERNEL_BUFFER_ARGS); + kernel_set_buffer_info(kg); #endif - } - else { - /* These rays do not participate in path-iteration. */ - float4 L_rad = make_float4(0.0f, 0.0f, 0.0f, 0.0f); - /* Accumulate result in output buffer. */ - kernel_write_pass_float4(per_sample_output_buffers, my_sample, L_rad); - path_rng_end(kg, rng_state, rng_coop[ray_index]); - ASSIGN_RAY_STATE(ray_state, ray_index, RAY_TO_REGENERATE); + + int thread_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); + + /* Initialize queue data and queue index. */ + if(thread_index < queuesize) { + for(int i = 0; i < NUM_QUEUES; i++) { + kernel_split_state.queue_data[i * queuesize + thread_index] = QUEUE_EMPTY_SLOT; } } - /* Mark rest of the ray-state indices as RAY_INACTIVE. */ - if(thread_index < (get_global_size(0) * get_global_size(1)) - (sh * (sw * parallel_samples))) { - /* First assignment, hence we dont use ASSIGN_RAY_STATE macro */ - ray_state[((sw * parallel_samples) * sh) + thread_index] = RAY_INACTIVE; + if(thread_index == 0) { + for(int i = 0; i < NUM_QUEUES; i++) { + Queue_index[i] = 0; + } + + /* The scene-intersect kernel should not use the queues very first time. + * since the queue would be empty. 
+ */ + *use_queues_flag = 0; } +#endif /* KERENL_STUB */ } + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/split/kernel_direct_lighting.h b/intern/cycles/kernel/split/kernel_direct_lighting.h index 82ca18829d3..2aac66ecb84 100644 --- a/intern/cycles/kernel/split/kernel_direct_lighting.h +++ b/intern/cycles/kernel/split/kernel_direct_lighting.h @@ -14,95 +14,136 @@ * limitations under the License. */ -#include "kernel_split_common.h" +CCL_NAMESPACE_BEGIN -/* Note on kernel_direct_lighting kernel. - * This is the eighth kernel in the ray tracing logic. This is the seventh - * of the path iteration kernels. This kernel takes care of direct lighting - * logic. However, the "shadow ray cast" part of direct lighting is handled +/* This kernel takes care of direct lighting logic. + * However, the "shadow ray cast" part of direct lighting is handled * in the next kernel. * - * This kernels determines the rays for which a shadow_blocked() function associated with direct lighting should be executed. - * Those rays for which a shadow_blocked() function for direct-lighting must be executed, are marked with flag RAY_SHADOW_RAY_CAST_DL and - * enqueued into the queue QUEUE_SHADOW_RAY_CAST_DL_RAYS + * This kernels determines the rays for which a shadow_blocked() function + * associated with direct lighting should be executed. Those rays for which + * a shadow_blocked() function for direct-lighting must be executed, are + * marked with flag RAY_SHADOW_RAY_CAST_DL and enqueued into the queue + * QUEUE_SHADOW_RAY_CAST_DL_RAYS * - * The input and output are as follows, + * Note on Queues: + * This kernel only reads from the QUEUE_ACTIVE_AND_REGENERATED_RAYS queue + * and processes only the rays of state RAY_ACTIVE; If a ray needs to execute + * the corresponding shadow_blocked part, after direct lighting, the ray is + * marked with RAY_SHADOW_RAY_CAST_DL flag. 
* - * rng_coop -----------------------------------------|--- kernel_direct_lighting --|--- BSDFEval_coop - * PathState_coop -----------------------------------| |--- ISLamp_coop - * sd -----------------------------------------------| |--- LightRay_coop - * ray_state ----------------------------------------| |--- ray_state - * Queue_data (QUEUE_ACTIVE_AND_REGENERATED_RAYS) ---| | - * kg (globals) -------------------------------------| | - * queuesize ----------------------------------------| | - * - * Note on Queues : - * This kernel only reads from the QUEUE_ACTIVE_AND_REGENERATED_RAYS queue and processes - * only the rays of state RAY_ACTIVE; If a ray needs to execute the corresponding shadow_blocked - * part, after direct lighting, the ray is marked with RAY_SHADOW_RAY_CAST_DL flag. - * - * State of queues when this kernel is called : - * state of queues QUEUE_ACTIVE_AND_REGENERATED_RAYS and QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be same - * before and after this kernel call. - * QUEUE_SHADOW_RAY_CAST_DL_RAYS queue will be filled with rays for which a shadow_blocked function must be executed, after this - * kernel call. Before this kernel call the QUEUE_SHADOW_RAY_CAST_DL_RAYS will be empty. + * State of queues when this kernel is called: + * - State of queues QUEUE_ACTIVE_AND_REGENERATED_RAYS and + * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be same before and after this + * kernel call. + * - QUEUE_SHADOW_RAY_CAST_DL_RAYS queue will be filled with rays for which a + * shadow_blocked function must be executed, after this kernel call + * Before this kernel call the QUEUE_SHADOW_RAY_CAST_DL_RAYS will be empty. 
*/ -ccl_device char kernel_direct_lighting( - KernelGlobals *kg, - ShaderData *sd, /* Required for direct lighting */ - ccl_global uint *rng_coop, /* Required for direct lighting */ - ccl_global PathState *PathState_coop, /* Required for direct lighting */ - ccl_global int *ISLamp_coop, /* Required for direct lighting */ - ccl_global Ray *LightRay_coop, /* Required for direct lighting */ - ccl_global BsdfEval *BSDFEval_coop, /* Required for direct lighting */ - ccl_global char *ray_state, /* Denotes the state of each ray */ - int ray_index) +ccl_device void kernel_direct_lighting(KernelGlobals *kg, + ccl_local_param unsigned int *local_queue_atomics) { + if(ccl_local_id(0) == 0 && ccl_local_id(1) == 0) { + *local_queue_atomics = 0; + } + ccl_barrier(CCL_LOCAL_MEM_FENCE); + char enqueue_flag = 0; - if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) { - ccl_global PathState *state = &PathState_coop[ray_index]; + int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); + ray_index = get_ray_index(kg, ray_index, + QUEUE_ACTIVE_AND_REGENERATED_RAYS, + kernel_split_state.queue_data, + kernel_split_params.queue_size, + 0); + + if(IS_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE)) { + ccl_global PathState *state = &kernel_split_state.path_state[ray_index]; + ShaderData *sd = &kernel_split_state.sd[ray_index]; /* direct lighting */ #ifdef __EMISSION__ - if((kernel_data.integrator.use_direct_light && - (ccl_fetch(sd, flag) & SD_BSDF_HAS_EVAL))) - { + bool flag = (kernel_data.integrator.use_direct_light && + (sd->flag & SD_BSDF_HAS_EVAL)); + +# ifdef __BRANCHED_PATH__ + if(flag && kernel_data.integrator.branched) { + flag = false; + enqueue_flag = 1; + } +# endif /* __BRANCHED_PATH__ */ + +# ifdef __SHADOW_TRICKS__ + if(flag && state->flag & PATH_RAY_SHADOW_CATCHER) { + flag = false; + enqueue_flag = 1; + } +# endif /* __SHADOW_TRICKS__ */ + + if(flag) { /* Sample illumination from lights to find path contribution. 
*/ - ccl_global RNG* rng = &rng_coop[ray_index]; - float light_t = path_state_rng_1D(kg, rng, state, PRNG_LIGHT); float light_u, light_v; - path_state_rng_2D(kg, rng, state, PRNG_LIGHT_U, &light_u, &light_v); - float terminate = path_state_rng_light_termination(kg, rng, state); + path_state_rng_2D(kg, state, PRNG_LIGHT_U, &light_u, &light_v); + float terminate = path_state_rng_light_termination(kg, state); LightSample ls; if(light_sample(kg, - light_t, light_u, light_v, - ccl_fetch(sd, time), - ccl_fetch(sd, P), + light_u, light_v, + sd->time, + sd->P, state->bounce, &ls)) { Ray light_ray; -#ifdef __OBJECT_MOTION__ - light_ray.time = ccl_fetch(sd, time); -#endif + light_ray.time = sd->time; BsdfEval L_light; bool is_lamp; - if(direct_emission(kg, sd, kg->sd_input, &ls, state, &light_ray, &L_light, &is_lamp, terminate)) { + if(direct_emission(kg, sd, &kernel_split_state.sd_DL_shadow[ray_index], &ls, state, &light_ray, &L_light, &is_lamp, terminate)) { /* Write intermediate data to global memory to access from * the next kernel. */ - LightRay_coop[ray_index] = light_ray; - BSDFEval_coop[ray_index] = L_light; - ISLamp_coop[ray_index] = is_lamp; + kernel_split_state.light_ray[ray_index] = light_ray; + kernel_split_state.bsdf_eval[ray_index] = L_light; + kernel_split_state.is_lamp[ray_index] = is_lamp; /* Mark ray state for next shadow kernel. */ - ADD_RAY_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL); enqueue_flag = 1; } } } #endif /* __EMISSION__ */ } - return enqueue_flag; + +#ifdef __EMISSION__ + /* Enqueue RAY_SHADOW_RAY_CAST_DL rays. 
*/ + enqueue_ray_index_local(ray_index, + QUEUE_SHADOW_RAY_CAST_DL_RAYS, + enqueue_flag, + kernel_split_params.queue_size, + local_queue_atomics, + kernel_split_state.queue_data, + kernel_split_params.queue_index); +#endif + +#ifdef __BRANCHED_PATH__ + /* Enqueue RAY_LIGHT_INDIRECT_NEXT_ITER rays + * this is the last kernel before next_iteration_setup that uses local atomics so we do this here + */ + ccl_barrier(CCL_LOCAL_MEM_FENCE); + if(ccl_local_id(0) == 0 && ccl_local_id(1) == 0) { + *local_queue_atomics = 0; + } + ccl_barrier(CCL_LOCAL_MEM_FENCE); + + ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); + enqueue_ray_index_local(ray_index, + QUEUE_LIGHT_INDIRECT_ITER, + IS_STATE(kernel_split_state.ray_state, ray_index, RAY_LIGHT_INDIRECT_NEXT_ITER), + kernel_split_params.queue_size, + local_queue_atomics, + kernel_split_state.queue_data, + kernel_split_params.queue_index); + +#endif /* __BRANCHED_PATH__ */ } + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/split/kernel_do_volume.h b/intern/cycles/kernel/split/kernel_do_volume.h new file mode 100644 index 00000000000..491487f1230 --- /dev/null +++ b/intern/cycles/kernel/split/kernel_do_volume.h @@ -0,0 +1,220 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +CCL_NAMESPACE_BEGIN + +#if defined(__BRANCHED_PATH__) && defined(__VOLUME__) + +ccl_device_inline void kernel_split_branched_path_volume_indirect_light_init(KernelGlobals *kg, int ray_index) +{ + kernel_split_branched_path_indirect_loop_init(kg, ray_index); + + ADD_RAY_FLAG(kernel_split_state.ray_state, ray_index, RAY_BRANCHED_VOLUME_INDIRECT); +} + +ccl_device_noinline bool kernel_split_branched_path_volume_indirect_light_iter(KernelGlobals *kg, int ray_index) +{ + SplitBranchedState *branched_state = &kernel_split_state.branched_state[ray_index]; + + ShaderData *sd = &kernel_split_state.sd[ray_index]; + PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; + ShaderData *emission_sd = &kernel_split_state.sd_DL_shadow[ray_index]; + + /* GPU: no decoupled ray marching, scatter probabilistically */ + int num_samples = kernel_data.integrator.volume_samples; + float num_samples_inv = 1.0f/num_samples; + + Ray volume_ray = branched_state->ray; + volume_ray.t = (!IS_STATE(&branched_state->ray_state, 0, RAY_HIT_BACKGROUND)) ? 
branched_state->isect.t : FLT_MAX; + + bool heterogeneous = volume_stack_is_heterogeneous(kg, branched_state->path_state.volume_stack); + + for(int j = branched_state->next_sample; j < num_samples; j++) { + ccl_global PathState *ps = &kernel_split_state.path_state[ray_index]; + *ps = branched_state->path_state; + + ccl_global Ray *pray = &kernel_split_state.ray[ray_index]; + *pray = branched_state->ray; + + ccl_global float3 *tp = &kernel_split_state.throughput[ray_index]; + *tp = branched_state->throughput * num_samples_inv; + + /* branch RNG state */ + path_state_branch(ps, j, num_samples); + + /* integrate along volume segment with distance sampling */ + VolumeIntegrateResult result = kernel_volume_integrate( + kg, ps, sd, &volume_ray, L, tp, heterogeneous); + +# ifdef __VOLUME_SCATTER__ + if(result == VOLUME_PATH_SCATTERED) { + /* direct lighting */ + kernel_path_volume_connect_light(kg, sd, emission_sd, *tp, &branched_state->path_state, L); + + /* indirect light bounce */ + if(!kernel_path_volume_bounce(kg, sd, tp, ps, &L->state, pray)) { + continue; + } + + /* start the indirect path */ + branched_state->next_closure = 0; + branched_state->next_sample = j+1; + + /* Attempting to share too many samples is slow for volumes as it causes us to + * loop here more and have many calls to kernel_volume_integrate which evaluates + * shaders. The many expensive shader evaluations cause the work load to become + * unbalanced and many threads to become idle in this kernel. Limiting the + * number of shared samples here helps quite a lot. 
+ */ + if(branched_state->shared_sample_count < 2) { + if(kernel_split_branched_indirect_start_shared(kg, ray_index)) { + continue; + } + } + + return true; + } +# endif + } + + branched_state->next_sample = num_samples; + + branched_state->waiting_on_shared_samples = (branched_state->shared_sample_count > 0); + if(branched_state->waiting_on_shared_samples) { + return true; + } + + kernel_split_branched_path_indirect_loop_end(kg, ray_index); + + /* todo: avoid this calculation using decoupled ray marching */ + float3 throughput = kernel_split_state.throughput[ray_index]; + kernel_volume_shadow(kg, emission_sd, &kernel_split_state.path_state[ray_index], &volume_ray, &throughput); + kernel_split_state.throughput[ray_index] = throughput; + + return false; +} + +#endif /* __BRANCHED_PATH__ && __VOLUME__ */ + +ccl_device void kernel_do_volume(KernelGlobals *kg) +{ +#ifdef __VOLUME__ + /* We will empty this queue in this kernel. */ + if(ccl_global_id(0) == 0 && ccl_global_id(1) == 0) { + kernel_split_params.queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS] = 0; +# ifdef __BRANCHED_PATH__ + kernel_split_params.queue_index[QUEUE_VOLUME_INDIRECT_ITER] = 0; +# endif /* __BRANCHED_PATH__ */ + } + + int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); + + if(*kernel_split_params.use_queues_flag) { + ray_index = get_ray_index(kg, ray_index, + QUEUE_ACTIVE_AND_REGENERATED_RAYS, + kernel_split_state.queue_data, + kernel_split_params.queue_size, + 1); + } + + ccl_global char *ray_state = kernel_split_state.ray_state; + + PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; + ccl_global PathState *state = &kernel_split_state.path_state[ray_index]; + + if(IS_STATE(ray_state, ray_index, RAY_ACTIVE) || + IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND)) { + ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index]; + ccl_global Ray *ray = &kernel_split_state.ray[ray_index]; + ccl_global Intersection *isect = 
&kernel_split_state.isect[ray_index]; + ShaderData *sd = &kernel_split_state.sd[ray_index]; + ShaderData *emission_sd = &kernel_split_state.sd_DL_shadow[ray_index]; + + bool hit = ! IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND); + + /* Sanitize volume stack. */ + if(!hit) { + kernel_volume_clean_stack(kg, state->volume_stack); + } + /* volume attenuation, emission, scatter */ + if(state->volume_stack[0].shader != SHADER_NONE) { + Ray volume_ray = *ray; + volume_ray.t = (hit)? isect->t: FLT_MAX; + +# ifdef __BRANCHED_PATH__ + if(!kernel_data.integrator.branched || IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT)) { +# endif /* __BRANCHED_PATH__ */ + bool heterogeneous = volume_stack_is_heterogeneous(kg, state->volume_stack); + + { + /* integrate along volume segment with distance sampling */ + VolumeIntegrateResult result = kernel_volume_integrate( + kg, state, sd, &volume_ray, L, throughput, heterogeneous); + +# ifdef __VOLUME_SCATTER__ + if(result == VOLUME_PATH_SCATTERED) { + /* direct lighting */ + kernel_path_volume_connect_light(kg, sd, emission_sd, *throughput, state, L); + + /* indirect light bounce */ + if(kernel_path_volume_bounce(kg, sd, throughput, state, &L->state, ray)) { + ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED); + } + else { + kernel_split_path_end(kg, ray_index); + } + } +# endif /* __VOLUME_SCATTER__ */ + } + +# ifdef __BRANCHED_PATH__ + } + else { + kernel_split_branched_path_volume_indirect_light_init(kg, ray_index); + + if(kernel_split_branched_path_volume_indirect_light_iter(kg, ray_index)) { + ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED); + } + } +# endif /* __BRANCHED_PATH__ */ + } + } + +# ifdef __BRANCHED_PATH__ + /* iter loop */ + ray_index = get_ray_index(kg, ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0), + QUEUE_VOLUME_INDIRECT_ITER, + kernel_split_state.queue_data, + kernel_split_params.queue_size, + 1); + + if(IS_STATE(ray_state, ray_index, RAY_VOLUME_INDIRECT_NEXT_ITER)) { + /* for render 
passes, sum and reset indirect light pass variables + * for the next samples */ + path_radiance_sum_indirect(&kernel_split_state.path_radiance[ray_index]); + path_radiance_reset_indirect(&kernel_split_state.path_radiance[ray_index]); + + if(kernel_split_branched_path_volume_indirect_light_iter(kg, ray_index)) { + ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED); + } + } +# endif /* __BRANCHED_PATH__ */ + +#endif /* __VOLUME__ */ +} + + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/split/kernel_enqueue_inactive.h b/intern/cycles/kernel/split/kernel_enqueue_inactive.h new file mode 100644 index 00000000000..496355bbc3a --- /dev/null +++ b/intern/cycles/kernel/split/kernel_enqueue_inactive.h @@ -0,0 +1,46 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +CCL_NAMESPACE_BEGIN + +ccl_device void kernel_enqueue_inactive(KernelGlobals *kg, + ccl_local_param unsigned int *local_queue_atomics) +{ +#ifdef __BRANCHED_PATH__ + /* Enqueue RAY_INACTIVE rays into QUEUE_INACTIVE_RAYS queue. 
*/ + if(ccl_local_id(0) == 0 && ccl_local_id(1) == 0) { + *local_queue_atomics = 0; + } + ccl_barrier(CCL_LOCAL_MEM_FENCE); + + int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); + + char enqueue_flag = 0; + if(IS_STATE(kernel_split_state.ray_state, ray_index, RAY_INACTIVE)) { + enqueue_flag = 1; + } + + enqueue_ray_index_local(ray_index, + QUEUE_INACTIVE_RAYS, + enqueue_flag, + kernel_split_params.queue_size, + local_queue_atomics, + kernel_split_state.queue_data, + kernel_split_params.queue_index); +#endif /* __BRANCHED_PATH__ */ +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h b/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h index 435d1171d5c..906bad8ceb6 100644 --- a/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h +++ b/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h @@ -14,247 +14,161 @@ * limitations under the License. */ -#include "kernel_split_common.h" +CCL_NAMESPACE_BEGIN -/* Note on kernel_holdout_emission_blurring_pathtermination_ao kernel. - * This is the sixth kernel in the ray tracing logic. This is the fifth - * of the path iteration kernels. This kernel takes care of the logic to process - * "material of type holdout", indirect primitive emission, bsdf blurring, - * probabilistic path termination and AO. +/* This kernel takes care of the logic to process "material of type holdout", + * indirect primitive emission, bsdf blurring, probabilistic path termination + * and AO. * - * This kernels determines the rays for which a shadow_blocked() function associated with AO should be executed. 
- * Those rays for which a shadow_blocked() function for AO must be executed are marked with flag RAY_SHADOW_RAY_CAST_ao and - * enqueued into the queue QUEUE_SHADOW_RAY_CAST_AO_RAYS + * This kernel determines the rays for which a shadow_blocked() function + * associated with AO should be executed. Those rays for which a + * shadow_blocked() function for AO must be executed are marked with flag + * RAY_SHADOW_RAY_CAST_AO and enqueued into the queue + * QUEUE_SHADOW_RAY_CAST_AO_RAYS * * Ray state of rays that are terminated in this kernel are changed to RAY_UPDATE_BUFFER * - * The input and output are as follows, + * Note on Queues: + * This kernel fetches rays from the queue QUEUE_ACTIVE_AND_REGENERATED_RAYS + * and processes only the rays of state RAY_ACTIVE. + * There are different points in this kernel where a ray may terminate and + * reach RAY_UPDATE_BUFFER state. These rays are enqueued into + * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue. These rays will still be present + * in QUEUE_ACTIVE_AND_REGENERATED_RAYS queue, but since their ray-state has + * been changed to RAY_UPDATE_BUFFER, there is no problem. 
* - * rng_coop ---------------------------------------------|--- kernel_holdout_emission_blurring_pathtermination_ao ---|--- Queue_index (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS) - * throughput_coop --------------------------------------| |--- PathState_coop - * PathRadiance_coop ------------------------------------| |--- throughput_coop - * Intersection_coop ------------------------------------| |--- L_transparent_coop - * PathState_coop ---------------------------------------| |--- per_sample_output_buffers - * L_transparent_coop -----------------------------------| |--- PathRadiance_coop - * sd ---------------------------------------------------| |--- ShaderData - * ray_state --------------------------------------------| |--- ray_state - * Queue_data (QUEUE_ACTIVE_AND_REGENERATED_RAYS) -------| |--- Queue_data (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS) - * Queue_index (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS) ---| |--- AOAlpha_coop - * kg (globals) -----------------------------------------| |--- AOBSDF_coop - * parallel_samples -------------------------------------| |--- AOLightRay_coop - * per_sample_output_buffers ----------------------------| | - * sw ---------------------------------------------------| | - * sh ---------------------------------------------------| | - * sx ---------------------------------------------------| | - * sy ---------------------------------------------------| | - * stride -----------------------------------------------| | - * work_array -------------------------------------------| | - * queuesize --------------------------------------------| | - * start_sample -----------------------------------------| | - * - * Note on Queues : - * This kernel fetches rays from the queue QUEUE_ACTIVE_AND_REGENERATED_RAYS and processes only - * the rays of state RAY_ACTIVE. - * There are different points in this kernel where a ray may terminate and reach RAY_UPDATE_BUFFER - * state. These rays are enqueued into QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue. 
These rays will - * still be present in QUEUE_ACTIVE_AND_REGENERATED_RAYS queue, but since their ray-state has been - * changed to RAY_UPDATE_BUFFER, there is no problem. - * - * State of queues when this kernel is called : + * State of queues when this kernel is called: * At entry, - * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE and RAY_REGENERATED rays - * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_TO_REGENERATE rays. - * QUEUE_SHADOW_RAY_CAST_AO_RAYS will be empty. + * - QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE and + * RAY_REGENERATED rays + * - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with + * RAY_TO_REGENERATE rays. + * - QUEUE_SHADOW_RAY_CAST_AO_RAYS will be empty. * At exit, - * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE, RAY_REGENERATED and RAY_UPDATE_BUFFER rays - * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_TO_REGENERATE and RAY_UPDATE_BUFFER rays - * QUEUE_SHADOW_RAY_CAST_AO_RAYS will be filled with rays marked with flag RAY_SHADOW_RAY_CAST_AO + * - QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE, + * RAY_REGENERATED and RAY_UPDATE_BUFFER rays. + * - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with + * RAY_TO_REGENERATE and RAY_UPDATE_BUFFER rays. 
+ * - QUEUE_SHADOW_RAY_CAST_AO_RAYS will be filled with rays marked with + * flag RAY_SHADOW_RAY_CAST_AO */ + ccl_device void kernel_holdout_emission_blurring_pathtermination_ao( KernelGlobals *kg, - ShaderData *sd, /* Required throughout the kernel except probabilistic path termination and AO */ - ccl_global float *per_sample_output_buffers, - ccl_global uint *rng_coop, /* Required for "kernel_write_data_passes" and AO */ - ccl_global float3 *throughput_coop, /* Required for handling holdout material and AO */ - ccl_global float *L_transparent_coop, /* Required for handling holdout material */ - PathRadiance *PathRadiance_coop, /* Required for "kernel_write_data_passes" and indirect primitive emission */ - ccl_global PathState *PathState_coop, /* Required throughout the kernel and AO */ - Intersection *Intersection_coop, /* Required for indirect primitive emission */ - ccl_global float3 *AOAlpha_coop, /* Required for AO */ - ccl_global float3 *AOBSDF_coop, /* Required for AO */ - ccl_global Ray *AOLightRay_coop, /* Required for AO */ - int sw, int sh, int sx, int sy, int stride, - ccl_global char *ray_state, /* Denotes the state of each ray */ - ccl_global unsigned int *work_array, /* Denotes the work that each ray belongs to */ -#ifdef __WORK_STEALING__ - unsigned int start_sample, -#endif - int parallel_samples, /* Number of samples to be processed in parallel */ - int ray_index, - char *enqueue_flag, - char *enqueue_flag_AO_SHADOW_RAY_CAST) + ccl_local_param BackgroundAOLocals *locals) { -#ifdef __WORK_STEALING__ - unsigned int my_work; - unsigned int pixel_x; - unsigned int pixel_y; + if(ccl_local_id(0) == 0 && ccl_local_id(1) == 0) { + locals->queue_atomics_bg = 0; + locals->queue_atomics_ao = 0; + } + ccl_barrier(CCL_LOCAL_MEM_FENCE); + +#ifdef __AO__ + char enqueue_flag = 0; +#endif + int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); + ray_index = get_ray_index(kg, ray_index, + QUEUE_ACTIVE_AND_REGENERATED_RAYS, + 
kernel_split_state.queue_data, + kernel_split_params.queue_size, + 0); + +#ifdef __COMPUTE_DEVICE_GPU__ + /* If we are executing on a GPU device, we exit all threads that are not + * required. + * + * If we are executing on a CPU device, then we need to keep all threads + * active since we have barrier() calls later in the kernel. CPU devices, + * expect all threads to execute barrier statement. + */ + if(ray_index == QUEUE_EMPTY_SLOT) { + return; + } +#endif /* __COMPUTE_DEVICE_GPU__ */ + +#ifndef __COMPUTE_DEVICE_GPU__ + if(ray_index != QUEUE_EMPTY_SLOT) { #endif - unsigned int tile_x; - unsigned int tile_y; - int my_sample_tile; - unsigned int sample; - ccl_global RNG *rng = 0x0; ccl_global PathState *state = 0x0; float3 throughput; - if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) { + ccl_global char *ray_state = kernel_split_state.ray_state; + ShaderData *sd = &kernel_split_state.sd[ray_index]; - throughput = throughput_coop[ray_index]; - state = &PathState_coop[ray_index]; - rng = &rng_coop[ray_index]; -#ifdef __WORK_STEALING__ - my_work = work_array[ray_index]; - sample = get_my_sample(my_work, sw, sh, parallel_samples, ray_index) + start_sample; - get_pixel_tile_position(&pixel_x, &pixel_y, - &tile_x, &tile_y, - my_work, - sw, sh, sx, sy, - parallel_samples, - ray_index); - my_sample_tile = 0; -#else /* __WORK_STEALING__ */ - sample = work_array[ray_index]; - /* Buffer's stride is "stride"; Find x and y using ray_index. 
*/ - int tile_index = ray_index / parallel_samples; - tile_x = tile_index % sw; - tile_y = tile_index / sw; - my_sample_tile = ray_index - (tile_index * parallel_samples); -#endif /* __WORK_STEALING__ */ - per_sample_output_buffers += - (((tile_x + (tile_y * stride)) * parallel_samples) + my_sample_tile) * - kernel_data.film.pass_stride; - - /* holdout */ -#ifdef __HOLDOUT__ - if((ccl_fetch(sd, flag) & (SD_HOLDOUT|SD_HOLDOUT_MASK)) && - (state->flag & PATH_RAY_CAMERA)) + if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) { + uint buffer_offset = kernel_split_state.buffer_offset[ray_index]; + ccl_global float *buffer = kernel_split_params.tile.buffer + buffer_offset; + + ccl_global Ray *ray = &kernel_split_state.ray[ray_index]; + ShaderData *emission_sd = &kernel_split_state.sd_DL_shadow[ray_index]; + PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; + + throughput = kernel_split_state.throughput[ray_index]; + state = &kernel_split_state.path_state[ray_index]; + + if(!kernel_path_shader_apply(kg, + sd, + state, + ray, + throughput, + emission_sd, + L, + buffer)) { - if(kernel_data.background.transparent) { - float3 holdout_weight; - - if(ccl_fetch(sd, flag) & SD_HOLDOUT_MASK) - holdout_weight = make_float3(1.0f, 1.0f, 1.0f); - else - holdout_weight = shader_holdout_eval(kg, sd); - - /* any throughput is ok, should all be identical here */ - L_transparent_coop[ray_index] += average(holdout_weight*throughput); - } - - if(ccl_fetch(sd, flag) & SD_HOLDOUT_MASK) { - ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER); - *enqueue_flag = 1; - } + kernel_split_path_end(kg, ray_index); } -#endif /* __HOLDOUT__ */ } if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) { - PathRadiance *L = &PathRadiance_coop[ray_index]; - /* Holdout mask objects do not write data passes. 
*/ - kernel_write_data_passes(kg, - per_sample_output_buffers, - L, - sd, - sample, - state, - throughput); - /* Blurring of bsdf after bounces, for rays that have a small likelihood - * of following this particular path (diffuse, rough glossy. - */ - if(kernel_data.integrator.filter_glossy != FLT_MAX) { - float blur_pdf = kernel_data.integrator.filter_glossy*state->min_ray_pdf; - if(blur_pdf < 1.0f) { - float blur_roughness = sqrtf(1.0f - blur_pdf)*0.5f; - shader_bsdf_blur(kg, sd, blur_roughness); - } - } - -#ifdef __EMISSION__ - /* emission */ - if(ccl_fetch(sd, flag) & SD_EMISSION) { - /* TODO(sergey): is isect.t wrong here for transparent surfaces? */ - float3 emission = indirect_primitive_emission( - kg, - sd, - Intersection_coop[ray_index].t, - state->flag, - state->ray_pdf); - path_radiance_accum_emission(L, throughput, emission, state->bounce); - } -#endif /* __EMISSION__ */ - /* Path termination. this is a strange place to put the termination, it's * mainly due to the mixed in MIS that we use. gives too many unneeded * shader evaluations, only need emission if we are going to terminate. 
*/ - float probability = path_state_terminate_probability(kg, state, throughput); + float probability = path_state_continuation_probability(kg, state, throughput); if(probability == 0.0f) { - ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER); - *enqueue_flag = 1; + kernel_split_path_end(kg, ray_index); + } + else if(probability < 1.0f) { + float terminate = path_state_rng_1D(kg, state, PRNG_TERMINATE); + if(terminate >= probability) { + kernel_split_path_end(kg, ray_index); + } + else { + kernel_split_state.throughput[ray_index] = throughput/probability; + } } if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) { - if(probability != 1.0f) { - float terminate = path_state_rng_1D_for_decision(kg, rng, state, PRNG_TERMINATE); - if(terminate >= probability) { - ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER); - *enqueue_flag = 1; - } - else { - throughput_coop[ray_index] = throughput/probability; - } - } + PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; + kernel_update_denoising_features(kg, sd, state, L); } } #ifdef __AO__ if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) { /* ambient occlusion */ - if(kernel_data.integrator.use_ambient_occlusion || - (ccl_fetch(sd, flag) & SD_AO)) - { - /* todo: solve correlation */ - float bsdf_u, bsdf_v; - path_state_rng_2D(kg, rng, state, PRNG_BSDF_U, &bsdf_u, &bsdf_v); - - float ao_factor = kernel_data.background.ao_factor; - float3 ao_N; - AOBSDF_coop[ray_index] = shader_bsdf_ao(kg, sd, ao_factor, &ao_N); - AOAlpha_coop[ray_index] = shader_bsdf_alpha(kg, sd); - - float3 ao_D; - float ao_pdf; - sample_cos_hemisphere(ao_N, bsdf_u, bsdf_v, &ao_D, &ao_pdf); - - if(dot(ccl_fetch(sd, Ng), ao_D) > 0.0f && ao_pdf != 0.0f) { - Ray _ray; - _ray.P = ray_offset(ccl_fetch(sd, P), ccl_fetch(sd, Ng)); - _ray.D = ao_D; - _ray.t = kernel_data.background.ao_distance; -#ifdef __OBJECT_MOTION__ - _ray.time = ccl_fetch(sd, time); -#endif - _ray.dP = ccl_fetch(sd, dP); - _ray.dD = differential3_zero(); - 
AOLightRay_coop[ray_index] = _ray; - - ADD_RAY_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO); - *enqueue_flag_AO_SHADOW_RAY_CAST = 1; - } + if(kernel_data.integrator.use_ambient_occlusion || (sd->flag & SD_AO)) { + enqueue_flag = 1; } } #endif /* __AO__ */ + +#ifndef __COMPUTE_DEVICE_GPU__ + } +#endif + +#ifdef __AO__ + /* Enqueue to-shadow-ray-cast rays. */ + enqueue_ray_index_local(ray_index, + QUEUE_SHADOW_RAY_CAST_AO_RAYS, + enqueue_flag, + kernel_split_params.queue_size, + &locals->queue_atomics_ao, + kernel_split_state.queue_data, + kernel_split_params.queue_index); +#endif } + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/split/kernel_indirect_background.h b/intern/cycles/kernel/split/kernel_indirect_background.h new file mode 100644 index 00000000000..437043a5971 --- /dev/null +++ b/intern/cycles/kernel/split/kernel_indirect_background.h @@ -0,0 +1,65 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +CCL_NAMESPACE_BEGIN + +ccl_device void kernel_indirect_background(KernelGlobals *kg) +{ + ccl_global char *ray_state = kernel_split_state.ray_state; + + int thread_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); + int ray_index; + + if(kernel_data.integrator.ao_bounces != INT_MAX) { + ray_index = get_ray_index(kg, thread_index, + QUEUE_ACTIVE_AND_REGENERATED_RAYS, + kernel_split_state.queue_data, + kernel_split_params.queue_size, + 0); + + if(ray_index != QUEUE_EMPTY_SLOT) { + if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) { + ccl_global PathState *state = &kernel_split_state.path_state[ray_index]; + if(path_state_ao_bounce(kg, state)) { + kernel_split_path_end(kg, ray_index); + } + } + } + } + + ray_index = get_ray_index(kg, thread_index, + QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS, + kernel_split_state.queue_data, + kernel_split_params.queue_size, + 0); + + if(ray_index == QUEUE_EMPTY_SLOT) { + return; + } + + if(IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND)) { + ccl_global PathState *state = &kernel_split_state.path_state[ray_index]; + PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; + ccl_global Ray *ray = &kernel_split_state.ray[ray_index]; + float3 throughput = kernel_split_state.throughput[ray_index]; + ShaderData *emission_sd = &kernel_split_state.sd_DL_shadow[ray_index]; + + kernel_path_background(kg, state, ray, throughput, emission_sd, L); + kernel_split_path_end(kg, ray_index); + } +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/split/kernel_indirect_subsurface.h b/intern/cycles/kernel/split/kernel_indirect_subsurface.h new file mode 100644 index 00000000000..e9fe5552e8c --- /dev/null +++ b/intern/cycles/kernel/split/kernel_indirect_subsurface.h @@ -0,0 +1,79 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +CCL_NAMESPACE_BEGIN + +ccl_device void kernel_indirect_subsurface(KernelGlobals *kg) +{ + int thread_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); + if(thread_index == 0) { + /* We will empty both queues in this kernel. */ + kernel_split_params.queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS] = 0; + kernel_split_params.queue_index[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS] = 0; + } + + int ray_index; + get_ray_index(kg, thread_index, + QUEUE_ACTIVE_AND_REGENERATED_RAYS, + kernel_split_state.queue_data, + kernel_split_params.queue_size, + 1); + ray_index = get_ray_index(kg, thread_index, + QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS, + kernel_split_state.queue_data, + kernel_split_params.queue_size, + 1); + +#ifdef __SUBSURFACE__ + + if(ray_index == QUEUE_EMPTY_SLOT) { + return; + } + + ccl_global char *ray_state = kernel_split_state.ray_state; + ccl_global PathState *state = &kernel_split_state.path_state[ray_index]; + PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; + ccl_global Ray *ray = &kernel_split_state.ray[ray_index]; + ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index]; + +#ifdef __BRANCHED_PATH__ + if(!kernel_data.integrator.branched) { +#endif + if(IS_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER)) { + ccl_addr_space SubsurfaceIndirectRays *ss_indirect = &kernel_split_state.ss_rays[ray_index]; + + /* Trace indirect subsurface rays by restarting the loop. this uses less + * stack memory than invoking kernel_path_indirect. 
+ */ + if(ss_indirect->num_rays) { + kernel_path_subsurface_setup_indirect(kg, + ss_indirect, + state, + ray, + L, + throughput); + ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED); + } + } +#ifdef __BRANCHED_PATH__ + } +#endif + +#endif /* __SUBSURFACE__ */ + +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/split/kernel_lamp_emission.h b/intern/cycles/kernel/split/kernel_lamp_emission.h index 3bd0e361078..448456d167d 100644 --- a/intern/cycles/kernel/split/kernel_lamp_emission.h +++ b/intern/cycles/kernel/split/kernel_lamp_emission.h @@ -14,70 +14,55 @@ * limitations under the License. */ -#include "kernel_split_common.h" +CCL_NAMESPACE_BEGIN -/* Note on kernel_lamp_emission - * This is the 3rd kernel in the ray-tracing logic. This is the second of the - * path-iteration kernels. This kernel takes care of the indirect lamp emission logic. - * This kernel operates on QUEUE_ACTIVE_AND_REGENERATED_RAYS. It processes rays of state RAY_ACTIVE - * and RAY_HIT_BACKGROUND. +/* This kernel operates on QUEUE_ACTIVE_AND_REGENERATED_RAYS. + * It processes rays of state RAY_ACTIVE and RAY_HIT_BACKGROUND. * We will empty QUEUE_ACTIVE_AND_REGENERATED_RAYS queue in this kernel. 
- * The input/output of the kernel is as follows, - * Throughput_coop ------------------------------------|--- kernel_lamp_emission --|--- PathRadiance_coop - * Ray_coop -------------------------------------------| |--- Queue_data(QUEUE_ACTIVE_AND_REGENERATED_RAYS) - * PathState_coop -------------------------------------| |--- Queue_index(QUEUE_ACTIVE_AND_REGENERATED_RAYS) - * kg (globals) ---------------------------------------| | - * Intersection_coop ----------------------------------| | - * ray_state ------------------------------------------| | - * Queue_data (QUEUE_ACTIVE_AND_REGENERATED_RAYS) -----| | - * Queue_index (QUEUE_ACTIVE_AND_REGENERATED_RAYS) ----| | - * queuesize ------------------------------------------| | - * use_queues_flag ------------------------------------| | - * sw -------------------------------------------------| | - * sh -------------------------------------------------| | */ -ccl_device void kernel_lamp_emission( - KernelGlobals *kg, - ccl_global float3 *throughput_coop, /* Required for lamp emission */ - PathRadiance *PathRadiance_coop, /* Required for lamp emission */ - ccl_global Ray *Ray_coop, /* Required for lamp emission */ - ccl_global PathState *PathState_coop, /* Required for lamp emission */ - Intersection *Intersection_coop, /* Required for lamp emission */ - ccl_global char *ray_state, /* Denotes the state of each ray */ - int sw, int sh, - ccl_global char *use_queues_flag, /* Used to decide if this kernel should use - * queues to fetch ray index - */ - int ray_index) +ccl_device void kernel_lamp_emission(KernelGlobals *kg) { - if(IS_STATE(ray_state, ray_index, RAY_ACTIVE) || - IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND)) - { - PathRadiance *L = &PathRadiance_coop[ray_index]; - ccl_global PathState *state = &PathState_coop[ray_index]; +#ifndef __VOLUME__ + /* We will empty this queue in this kernel. 
*/ + if(ccl_global_id(0) == 0 && ccl_global_id(1) == 0) { + kernel_split_params.queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS] = 0; + } +#endif + /* Fetch use_queues_flag. */ + char local_use_queues_flag = *kernel_split_params.use_queues_flag; + ccl_barrier(CCL_LOCAL_MEM_FENCE); - float3 throughput = throughput_coop[ray_index]; - Ray ray = Ray_coop[ray_index]; + int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); + if(local_use_queues_flag) { + ray_index = get_ray_index(kg, ray_index, + QUEUE_ACTIVE_AND_REGENERATED_RAYS, + kernel_split_state.queue_data, + kernel_split_params.queue_size, +#ifndef __VOLUME__ + 1 +#else + 0 +#endif + ); + if(ray_index == QUEUE_EMPTY_SLOT) { + return; + } + } -#ifdef __LAMP_MIS__ - if(kernel_data.integrator.use_lamp_mis && !(state->flag & PATH_RAY_CAMERA)) { - /* ray starting from previous non-transparent bounce */ - Ray light_ray; + if(IS_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE) || + IS_STATE(kernel_split_state.ray_state, ray_index, RAY_HIT_BACKGROUND)) + { + PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; + ccl_global PathState *state = &kernel_split_state.path_state[ray_index]; - light_ray.P = ray.P - state->ray_t*ray.D; - state->ray_t += Intersection_coop[ray_index].t; - light_ray.D = ray.D; - light_ray.t = state->ray_t; - light_ray.time = ray.time; - light_ray.dD = ray.dD; - light_ray.dP = ray.dP; - /* intersect with lamp */ - float3 emission; + float3 throughput = kernel_split_state.throughput[ray_index]; + Ray ray = kernel_split_state.ray[ray_index]; + ccl_global Intersection *isect = &kernel_split_state.isect[ray_index]; + ShaderData *emission_sd = &kernel_split_state.sd_DL_shadow[ray_index]; - if(indirect_lamp_emission(kg, kg->sd_input, state, &light_ray, &emission)) { - path_radiance_accum_emission(L, throughput, emission, state->bounce); - } - } -#endif /* __LAMP_MIS__ */ + kernel_path_lamp_emission(kg, state, &ray, throughput, isect, emission_sd, L); } } + 
+CCL_NAMESPACE_END + diff --git a/intern/cycles/kernel/split/kernel_next_iteration_setup.h b/intern/cycles/kernel/split/kernel_next_iteration_setup.h index 816f3a6fbff..c3373174582 100644 --- a/intern/cycles/kernel/split/kernel_next_iteration_setup.h +++ b/intern/cycles/kernel/split/kernel_next_iteration_setup.h @@ -14,128 +14,230 @@ * limitations under the License. */ -#include "kernel_split_common.h" +CCL_NAMESPACE_BEGIN -/* Note on kernel_setup_next_iteration kernel. - * This is the tenth kernel in the ray tracing logic. This is the ninth - * of the path iteration kernels. This kernel takes care of setting up - * Ray for the next iteration of path-iteration and accumulating radiance - * corresponding to AO and direct-lighting +/*This kernel takes care of setting up ray for the next iteration of + * path-iteration and accumulating radiance corresponding to AO and + * direct-lighting * - * Ray state of rays that are terminated in this kernel are changed to RAY_UPDATE_BUFFER + * Ray state of rays that are terminated in this kernel are changed + * to RAY_UPDATE_BUFFER. * - * The input and output are as follows, + * Note on queues: + * This kernel fetches rays from the queue QUEUE_ACTIVE_AND_REGENERATED_RAYS + * and processes only the rays of state RAY_ACTIVE. + * There are different points in this kernel where a ray may terminate and + * reach RAY_UPDATE_BUFF state. These rays are enqueued into + * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue. These rays will still be present + * in QUEUE_ACTIVE_AND_REGENERATED_RAYS queue, but since their ray-state has + * been changed to RAY_UPDATE_BUFF, there is no problem. 
* - * rng_coop ---------------------------------------------|--- kernel_next_iteration_setup -|--- Queue_index (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS) - * throughput_coop --------------------------------------| |--- Queue_data (QUEUE_HITBF_BUFF_UPDATE_TOREGEN_RAYS) - * PathRadiance_coop ------------------------------------| |--- throughput_coop - * PathState_coop ---------------------------------------| |--- PathRadiance_coop - * sd ---------------------------------------------------| |--- PathState_coop - * ray_state --------------------------------------------| |--- ray_state - * Queue_data (QUEUE_ACTIVE_AND_REGENERATD_RAYS) --------| |--- Ray_coop - * Queue_index (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS) ---| |--- use_queues_flag - * Ray_coop ---------------------------------------------| | - * kg (globals) -----------------------------------------| | - * LightRay_dl_coop -------------------------------------| - * ISLamp_coop ------------------------------------------| - * BSDFEval_coop ----------------------------------------| - * LightRay_ao_coop -------------------------------------| - * AOBSDF_coop ------------------------------------------| - * AOAlpha_coop -----------------------------------------| - * - * Note on queues, - * This kernel fetches rays from the queue QUEUE_ACTIVE_AND_REGENERATED_RAYS and processes only - * the rays of state RAY_ACTIVE. - * There are different points in this kernel where a ray may terminate and reach RAY_UPDATE_BUFF - * state. These rays are enqueued into QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue. These rays will - * still be present in QUEUE_ACTIVE_AND_REGENERATED_RAYS queue, but since their ray-state has been - * changed to RAY_UPDATE_BUFF, there is no problem. - * - * State of queues when this kernel is called : + * State of queues when this kernel is called: * At entry, - * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE, RAY_REGENERATED, RAY_UPDATE_BUFFER rays. 
- * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_TO_REGENERATE and RAY_UPDATE_BUFFER rays + * - QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE, + * RAY_REGENERATED, RAY_UPDATE_BUFFER rays. + * - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with + * RAY_TO_REGENERATE and RAY_UPDATE_BUFFER rays. * At exit, - * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE, RAY_REGENERATED and more RAY_UPDATE_BUFFER rays. - * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_TO_REGENERATE and more RAY_UPDATE_BUFFER rays + * - QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE, + * RAY_REGENERATED and more RAY_UPDATE_BUFFER rays. + * - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with + * RAY_TO_REGENERATE and more RAY_UPDATE_BUFFER rays. */ -ccl_device char kernel_next_iteration_setup( - KernelGlobals *kg, - ShaderData *sd, /* Required for setting up ray for next iteration */ - ccl_global uint *rng_coop, /* Required for setting up ray for next iteration */ - ccl_global float3 *throughput_coop, /* Required for setting up ray for next iteration */ - PathRadiance *PathRadiance_coop, /* Required for setting up ray for next iteration */ - ccl_global Ray *Ray_coop, /* Required for setting up ray for next iteration */ - ccl_global PathState *PathState_coop, /* Required for setting up ray for next iteration */ - ccl_global Ray *LightRay_dl_coop, /* Required for radiance update - direct lighting */ - ccl_global int *ISLamp_coop, /* Required for radiance update - direct lighting */ - ccl_global BsdfEval *BSDFEval_coop, /* Required for radiance update - direct lighting */ - ccl_global Ray *LightRay_ao_coop, /* Required for radiance update - AO */ - ccl_global float3 *AOBSDF_coop, /* Required for radiance update - AO */ - ccl_global float3 *AOAlpha_coop, /* Required for radiance update - AO */ - ccl_global char *ray_state, /* Denotes the state of each ray */ - ccl_global char *use_queues_flag, /* flag to 
#ifdef __BRANCHED_PATH__
/* Begins the branched indirect-light loop for one ray: runs the shared
 * loop-init helper and tags the ray with RAY_BRANCHED_LIGHT_INDIRECT.
 */
ccl_device_inline void kernel_split_branched_indirect_light_init(KernelGlobals *kg, int ray_index)
{
	ccl_global char *ray_state = kernel_split_state.ray_state;

	kernel_split_branched_path_indirect_loop_init(kg, ray_index);
	ADD_RAY_FLAG(ray_state, ray_index, RAY_BRANCHED_LIGHT_INDIRECT);
}

/* Finishes the branched indirect-light loop for one ray.
 *
 * After the shared loop-end helper has run, the throughput is scaled by the
 * surface transparency. A zero throughput ends the path; otherwise the ray
 * is set up to continue through the surface as a transparent bounce.
 */
ccl_device void kernel_split_branched_indirect_light_end(KernelGlobals *kg, int ray_index)
{
	kernel_split_branched_path_indirect_loop_end(kg, ray_index);

	ShaderData *sd = &kernel_split_state.sd[ray_index];
	ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index];
	ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
	ccl_global Ray *ray = &kernel_split_state.ray[ray_index];

	/* Only the transparent part of the surface lets the path continue. */
	*throughput *= shader_bsdf_transparency(kg, sd);

	if(is_zero(*throughput)) {
		/* Nothing gets through: terminate the path. */
		kernel_split_path_end(kg, ray_index);
		return;
	}

	/* Record the transparent bounce in the path state. */
	state->flag |= PATH_RAY_TRANSPARENT;
	state->transparent_bounce++;

	/* Restart the ray just behind the surface; the remaining length is
	 * shortened so clipping works through transparent surfaces. */
	ray->P = ray_offset(sd->P, -sd->Ng);
	ray->t -= sd->ray_length;

#  ifdef __RAY_DIFFERENTIALS__
	ray->dP = sd->dP;
	ray->dD.dx = -sd->dI.dx;
	ray->dD.dy = -sd->dI.dy;
#  endif  /* __RAY_DIFFERENTIALS__ */

#  ifdef __VOLUME__
	/* Enter or exit the volume bounded by this surface. */
	kernel_volume_stack_enter_exit(kg, sd, state->volume_stack);
#  endif  /* __VOLUME__ */
}
#endif  /* __BRANCHED_PATH__ */
ray->dP = sd->dP; + ray->dD.dx = -sd->dI.dx; + ray->dD.dy = -sd->dI.dy; +# endif /* __RAY_DIFFERENTIALS__ */ + +# ifdef __VOLUME__ + /* enter/exit volume */ + kernel_volume_stack_enter_exit(kg, sd, state->volume_stack); +# endif /* __VOLUME__ */ + } +} +#endif /* __BRANCHED_PATH__ */ + +ccl_device void kernel_next_iteration_setup(KernelGlobals *kg, + ccl_local_param unsigned int *local_queue_atomics) +{ + if(ccl_local_id(0) == 0 && ccl_local_id(1) == 0) { + *local_queue_atomics = 0; + } + ccl_barrier(CCL_LOCAL_MEM_FENCE); + + if(ccl_global_id(0) == 0 && ccl_global_id(1) == 0) { + /* If we are here, then it means that scene-intersect kernel + * has already been executed atleast once. From the next time, + * scene-intersect kernel may operate on queues to fetch ray index + */ + *kernel_split_params.use_queues_flag = 1; + + /* Mark queue indices of QUEUE_SHADOW_RAY_CAST_AO_RAYS and + * QUEUE_SHADOW_RAY_CAST_DL_RAYS queues that were made empty during the + * previous kernel. + */ + kernel_split_params.queue_index[QUEUE_SHADOW_RAY_CAST_AO_RAYS] = 0; + kernel_split_params.queue_index[QUEUE_SHADOW_RAY_CAST_DL_RAYS] = 0; + } + + int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); + ray_index = get_ray_index(kg, ray_index, + QUEUE_ACTIVE_AND_REGENERATED_RAYS, + kernel_split_state.queue_data, + kernel_split_params.queue_size, + 0); + + ccl_global char *ray_state = kernel_split_state.ray_state; + + bool active = IS_STATE(ray_state, ray_index, RAY_ACTIVE); + if(active) { + ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index]; + ccl_global Ray *ray = &kernel_split_state.ray[ray_index]; + ShaderData *sd = &kernel_split_state.sd[ray_index]; + ccl_global PathState *state = &kernel_split_state.path_state[ray_index]; + PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; + +#ifdef __BRANCHED_PATH__ + if(!kernel_data.integrator.branched || IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT)) { +#endif + /* Compute direct lighting 
and next bounce. */ + if(!kernel_path_surface_bounce(kg, sd, throughput, state, &L->state, ray)) { + kernel_split_path_end(kg, ray_index); } - REMOVE_RAY_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO); +#ifdef __BRANCHED_PATH__ } + else { + kernel_split_branched_indirect_light_init(kg, ray_index); - if(IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL)) { - float3 shadow = LightRay_dl_coop[ray_index].P; - char update_path_radiance = LightRay_dl_coop[ray_index].t; - if(update_path_radiance) { - BsdfEval L_light = BSDFEval_coop[ray_index]; - path_radiance_accum_light(L, - _throughput, - &L_light, - shadow, - 1.0f, - state->bounce, - ISLamp_coop[ray_index]); + if(kernel_split_branched_path_surface_indirect_light_iter(kg, + ray_index, + 1.0f, + &kernel_split_state.branched_state[ray_index].sd, + true, + true)) + { + ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED); + } + else { + kernel_split_branched_indirect_light_end(kg, ray_index); } - REMOVE_RAY_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL); } +#endif /* __BRANCHED_PATH__ */ } - if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) { - ccl_global float3 *throughput = &throughput_coop[ray_index]; - ccl_global Ray *ray = &Ray_coop[ray_index]; - ccl_global RNG *rng = &rng_coop[ray_index]; - state = &PathState_coop[ray_index]; - L = &PathRadiance_coop[ray_index]; - - /* Compute direct lighting and next bounce. */ - if(!kernel_path_surface_bounce(kg, rng, sd, throughput, state, L, ray)) { - ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER); - enqueue_flag = 1; + /* Enqueue RAY_UPDATE_BUFFER rays. 
*/ + enqueue_ray_index_local(ray_index, + QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS, + IS_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER) && active, + kernel_split_params.queue_size, + local_queue_atomics, + kernel_split_state.queue_data, + kernel_split_params.queue_index); + +#ifdef __BRANCHED_PATH__ + /* iter loop */ + if(ccl_global_id(0) == 0 && ccl_global_id(1) == 0) { + kernel_split_params.queue_index[QUEUE_LIGHT_INDIRECT_ITER] = 0; + } + + ray_index = get_ray_index(kg, ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0), + QUEUE_LIGHT_INDIRECT_ITER, + kernel_split_state.queue_data, + kernel_split_params.queue_size, + 1); + + if(IS_STATE(ray_state, ray_index, RAY_LIGHT_INDIRECT_NEXT_ITER)) { + /* for render passes, sum and reset indirect light pass variables + * for the next samples */ + PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; + + path_radiance_sum_indirect(L); + path_radiance_reset_indirect(L); + + if(kernel_split_branched_path_surface_indirect_light_iter(kg, + ray_index, + 1.0f, + &kernel_split_state.branched_state[ray_index].sd, + true, + true)) + { + ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED); + } + else { + kernel_split_branched_indirect_light_end(kg, ray_index); } } - return enqueue_flag; +# ifdef __VOLUME__ + /* Enqueue RAY_VOLUME_INDIRECT_NEXT_ITER rays */ + ccl_barrier(CCL_LOCAL_MEM_FENCE); + if(ccl_local_id(0) == 0 && ccl_local_id(1) == 0) { + *local_queue_atomics = 0; + } + ccl_barrier(CCL_LOCAL_MEM_FENCE); + + ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); + enqueue_ray_index_local(ray_index, + QUEUE_VOLUME_INDIRECT_ITER, + IS_STATE(kernel_split_state.ray_state, ray_index, RAY_VOLUME_INDIRECT_NEXT_ITER), + kernel_split_params.queue_size, + local_queue_atomics, + kernel_split_state.queue_data, + kernel_split_params.queue_index); + +# endif /* __VOLUME__ */ + +# ifdef __SUBSURFACE__ + /* Enqueue RAY_SUBSURFACE_INDIRECT_NEXT_ITER rays */ + ccl_barrier(CCL_LOCAL_MEM_FENCE); + if(ccl_local_id(0) 
== 0 && ccl_local_id(1) == 0) { + *local_queue_atomics = 0; + } + ccl_barrier(CCL_LOCAL_MEM_FENCE); + + ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); + enqueue_ray_index_local(ray_index, + QUEUE_SUBSURFACE_INDIRECT_ITER, + IS_STATE(kernel_split_state.ray_state, ray_index, RAY_SUBSURFACE_INDIRECT_NEXT_ITER), + kernel_split_params.queue_size, + local_queue_atomics, + kernel_split_state.queue_data, + kernel_split_params.queue_index); +# endif /* __SUBSURFACE__ */ +#endif /* __BRANCHED_PATH__ */ } + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/split/kernel_path_init.h b/intern/cycles/kernel/split/kernel_path_init.h new file mode 100644 index 00000000000..5ad62b585fe --- /dev/null +++ b/intern/cycles/kernel/split/kernel_path_init.h @@ -0,0 +1,81 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +CCL_NAMESPACE_BEGIN + +/* This kernel initializes structures needed in path-iteration kernels. + * This is the first kernel in ray-tracing logic. + * + * Ray state of rays outside the tile-boundary will be marked RAY_INACTIVE + */ +ccl_device void kernel_path_init(KernelGlobals *kg) { + int ray_index = ccl_global_id(0) + ccl_global_id(1) * ccl_global_size(0); + + /* This is the first assignment to ray_state; + * So we dont use ASSIGN_RAY_STATE macro. + */ + kernel_split_state.ray_state[ray_index] = RAY_ACTIVE; + + /* Get work. 
*/ + ccl_global uint *work_pools = kernel_split_params.work_pools; + uint total_work_size = kernel_split_params.total_work_size; + uint work_index; + + if(!get_next_work(kg, work_pools, total_work_size, ray_index, &work_index)) { + /* No more work, mark ray as inactive */ + kernel_split_state.ray_state[ray_index] = RAY_INACTIVE; + + return; + } + + ccl_global WorkTile *tile = &kernel_split_params.tile; + uint x, y, sample; + get_work_pixel(tile, work_index, &x, &y, &sample); + + /* Store buffer offset for writing to passes. */ + uint buffer_offset = (tile->offset + x + y*tile->stride) * kernel_data.film.pass_stride; + kernel_split_state.buffer_offset[ray_index] = buffer_offset; + + /* Initialize random numbers and ray. */ + uint rng_hash; + kernel_path_trace_setup(kg, + sample, + x, y, + &rng_hash, + &kernel_split_state.ray[ray_index]); + + if(kernel_split_state.ray[ray_index].t != 0.0f) { + /* Initialize throughput, path radiance, Ray, PathState; + * These rays proceed with path-iteration. 
+ */ + kernel_split_state.throughput[ray_index] = make_float3(1.0f, 1.0f, 1.0f); + path_radiance_init(&kernel_split_state.path_radiance[ray_index], kernel_data.film.use_light_pass); + path_state_init(kg, + &kernel_split_state.sd_DL_shadow[ray_index], + &kernel_split_state.path_state[ray_index], + rng_hash, + sample, + &kernel_split_state.ray[ray_index]); +#ifdef __SUBSURFACE__ + kernel_path_subsurface_init_indirect(&kernel_split_state.ss_rays[ray_index]); +#endif + } + else { + ASSIGN_RAY_STATE(kernel_split_state.ray_state, ray_index, RAY_TO_REGENERATE); + } +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/split/kernel_queue_enqueue.h b/intern/cycles/kernel/split/kernel_queue_enqueue.h new file mode 100644 index 00000000000..66ce2dfb6f1 --- /dev/null +++ b/intern/cycles/kernel/split/kernel_queue_enqueue.h @@ -0,0 +1,91 @@ +/* + * Copyright 2011-2016 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +CCL_NAMESPACE_BEGIN + +/* This kernel enqueues rays of different ray state into their + * appropriate queues: + * + * 1. Rays that have been determined to hit the background from the + * "kernel_scene_intersect" kernel are enqueued in + * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS; + * 2. Rays that have been determined to be actively participating in pat + * -iteration will be enqueued into QUEUE_ACTIVE_AND_REGENERATED_RAYS. 
+ * + * State of queue during other times this kernel is called: + * At entry, + * - QUEUE_ACTIVE_AND_REGENERATED_RAYS will be empty. + * - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will contain RAY_TO_REGENERATE + * and RAY_UPDATE_BUFFER rays. + * At exit, + * - QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE rays. + * - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with + * RAY_TO_REGENERATE, RAY_UPDATE_BUFFER, RAY_HIT_BACKGROUND rays. + */ +ccl_device void kernel_queue_enqueue(KernelGlobals *kg, + ccl_local_param QueueEnqueueLocals *locals) +{ + /* We have only 2 cases (Hit/Not-Hit) */ + int lidx = ccl_local_id(1) * ccl_local_size(0) + ccl_local_id(0); + int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); + + if(lidx == 0) { + locals->queue_atomics[0] = 0; + locals->queue_atomics[1] = 0; + } + ccl_barrier(CCL_LOCAL_MEM_FENCE); + + int queue_number = -1; + + if(IS_STATE(kernel_split_state.ray_state, ray_index, RAY_HIT_BACKGROUND) || + IS_STATE(kernel_split_state.ray_state, ray_index, RAY_UPDATE_BUFFER) || + IS_STATE(kernel_split_state.ray_state, ray_index, RAY_TO_REGENERATE)) { + queue_number = QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS; + } + else if(IS_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE) || + IS_STATE(kernel_split_state.ray_state, ray_index, RAY_REGENERATED)) { + queue_number = QUEUE_ACTIVE_AND_REGENERATED_RAYS; + } + + unsigned int my_lqidx; + if(queue_number != -1) { + my_lqidx = get_local_queue_index(queue_number, locals->queue_atomics); + } + ccl_barrier(CCL_LOCAL_MEM_FENCE); + + if(lidx == 0) { + locals->queue_atomics[QUEUE_ACTIVE_AND_REGENERATED_RAYS] = + get_global_per_queue_offset(QUEUE_ACTIVE_AND_REGENERATED_RAYS, + locals->queue_atomics, + kernel_split_params.queue_index); + locals->queue_atomics[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS] = + get_global_per_queue_offset(QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS, + locals->queue_atomics, + kernel_split_params.queue_index); + } + 
ccl_barrier(CCL_LOCAL_MEM_FENCE); + + unsigned int my_gqidx; + if(queue_number != -1) { + my_gqidx = get_global_queue_index(queue_number, + kernel_split_params.queue_size, + my_lqidx, + locals->queue_atomics); + kernel_split_state.queue_data[my_gqidx] = ray_index; + } +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/split/kernel_scene_intersect.h b/intern/cycles/kernel/split/kernel_scene_intersect.h index fc4b4ee38e5..f5378bc172b 100644 --- a/intern/cycles/kernel/split/kernel_scene_intersect.h +++ b/intern/cycles/kernel/split/kernel_scene_intersect.h @@ -14,119 +14,66 @@ * limitations under the License. */ -#include "kernel_split_common.h" +CCL_NAMESPACE_BEGIN -/* Note on kernel_scene_intersect kernel. - * This is the second kernel in the ray tracing logic. This is the first - * of the path iteration kernels. This kernel takes care of scene_intersect function. +/* This kernel takes care of scene_intersect function. * * This kernel changes the ray_state of RAY_REGENERATED rays to RAY_ACTIVE. * This kernel processes rays of ray state RAY_ACTIVE - * This kernel determines the rays that have hit the background and changes their ray state to RAY_HIT_BACKGROUND. - * - * The input and output are as follows, - * - * Ray_coop ---------------------------------------|--------- kernel_scene_intersect----------|--- PathState - * PathState_coop ---------------------------------| |--- Intersection - * ray_state --------------------------------------| |--- ray_state - * use_queues_flag --------------------------------| | - * QueueData(QUEUE_ACTIVE_AND_REGENERATED_RAYS) ---| | - * kg (globals) -----------------------------------| | - * rng_coop ---------------------------------------| | - * sw ---------------------------------------------| | - * sh ---------------------------------------------| | - * queuesize --------------------------------------| | - * - * Note on Queues : - * Ideally we would want kernel_scene_intersect to work on queues. 
- * But during the very first time, the queues will be empty and hence we perform a direct mapping - * between ray-index and thread-index; From the next time onward, the queue will be filled and - * we may start operating on queues. - * - * State of queue during the first time this kernel is called : - * QUEUE_ACTIVE_AND_REGENERATED_RAYS and QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be empty.before and after this kernel - * - * State of queues during other times this kernel is called : - * At entry, - * QUEUE_ACTIVE_AND_REGENERATED_RAYS will have a mix of RAY_ACTIVE, RAY_UPDATE_BUFFER and RAY_REGENERATED rays; - * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_TO_REGENERATE and RAY_UPDATE_BUFFER rays ; - * (The rays that are in the state RAY_UPDATE_BUFFER in both the queues are actually the same rays; These - * are the rays that were in RAY_ACTIVE state during the initial enqueue but on further processing - * , by different kernels, have turned into RAY_UPDATE_BUFFER rays. Since all kernel, even after fetching from - * QUEUE_ACTIVE_AND_REGENERATED_RAYS, proceed further based on ray state information, RAY_UPDATE_BUFFER rays - * being present in QUEUE_ACTIVE_AND_REGENERATED_RAYS does not cause any logical issues) - * At exit, - * QUEUE_ACTIVE_AND_REGENERATED_RAYS - All RAY_REGENERATED rays will have been converted to RAY_ACTIVE and - * Some rays in QUEUE_ACTIVE_AND_REGENERATED_RAYS queue will move to state RAY_HIT_BACKGROUND - * QUEUE_HITBF_BUFF_UPDATE_TOREGEN_RAYS - no change + * This kernel determines the rays that have hit the background and changes + * their ray state to RAY_HIT_BACKGROUND. 
*/ - -ccl_device void kernel_scene_intersect( - KernelGlobals *kg, - ccl_global uint *rng_coop, - ccl_global Ray *Ray_coop, /* Required for scene_intersect */ - ccl_global PathState *PathState_coop, /* Required for scene_intersect */ - Intersection *Intersection_coop, /* Required for scene_intersect */ - ccl_global char *ray_state, /* Denotes the state of each ray */ - int sw, int sh, - ccl_global char *use_queues_flag, /* used to decide if this kernel should use - * queues to fetch ray index */ -#ifdef __KERNEL_DEBUG__ - DebugData *debugdata_coop, -#endif - int ray_index) +ccl_device void kernel_scene_intersect(KernelGlobals *kg) { - /* All regenerated rays become active here */ - if(IS_STATE(ray_state, ray_index, RAY_REGENERATED)) - ASSIGN_RAY_STATE(ray_state, ray_index, RAY_ACTIVE); - - if(!IS_STATE(ray_state, ray_index, RAY_ACTIVE)) - return; + /* Fetch use_queues_flag */ + char local_use_queues_flag = *kernel_split_params.use_queues_flag; + ccl_barrier(CCL_LOCAL_MEM_FENCE); -#ifdef __KERNEL_DEBUG__ - DebugData *debug_data = &debugdata_coop[ray_index]; -#endif - Intersection *isect = &Intersection_coop[ray_index]; - PathState state = PathState_coop[ray_index]; - Ray ray = Ray_coop[ray_index]; + int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); + if(local_use_queues_flag) { + ray_index = get_ray_index(kg, ray_index, + QUEUE_ACTIVE_AND_REGENERATED_RAYS, + kernel_split_state.queue_data, + kernel_split_params.queue_size, + 0); - /* intersect scene */ - uint visibility = path_state_ray_visibility(kg, &state); - -#ifdef __HAIR__ - float difl = 0.0f, extmax = 0.0f; - uint lcg_state = 0; - RNG rng = rng_coop[ray_index]; + if(ray_index == QUEUE_EMPTY_SLOT) { + return; + } + } - if(kernel_data.bvh.have_curves) { - if((kernel_data.cam.resolution == 1) && (state.flag & PATH_RAY_CAMERA)) { - float3 pixdiff = ray.dD.dx + ray.dD.dy; - /*pixdiff = pixdiff - dot(pixdiff, ray.D)*ray.D;*/ - difl = kernel_data.curve.minimum_width * len(pixdiff) * 0.5f; + /* 
All regenerated rays become active here */ + if(IS_STATE(kernel_split_state.ray_state, ray_index, RAY_REGENERATED)) { +#ifdef __BRANCHED_PATH__ + if(kernel_split_state.branched_state[ray_index].waiting_on_shared_samples) { + kernel_split_path_end(kg, ray_index); + } + else +#endif /* __BRANCHED_PATH__ */ + { + ASSIGN_RAY_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE); } + } - extmax = kernel_data.curve.maximum_width; - lcg_state = lcg_state_init(&rng, &state, 0x51633e2d); + if(!IS_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE)) { + return; } - bool hit = scene_intersect(kg, ray, visibility, isect, &lcg_state, difl, extmax); -#else - bool hit = scene_intersect(kg, ray, visibility, isect, NULL, 0.0f, 0.0f); -#endif + ccl_global PathState *state = &kernel_split_state.path_state[ray_index]; + Ray ray = kernel_split_state.ray[ray_index]; + PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; -#ifdef __KERNEL_DEBUG__ - if(state.flag & PATH_RAY_CAMERA) { - debug_data->num_bvh_traversal_steps += isect->num_traversal_steps; - debug_data->num_bvh_traversed_instances += isect->num_traversed_instances; - } - debug_data->num_ray_bounces++; -#endif + Intersection isect; + bool hit = kernel_path_scene_intersect(kg, state, &ray, &isect, L); + kernel_split_state.isect[ray_index] = isect; if(!hit) { /* Change the state of rays that hit the background; * These rays undergo special processing in the * background_bufferUpdate kernel. 
*/ - ASSIGN_RAY_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND); + ASSIGN_RAY_STATE(kernel_split_state.ray_state, ray_index, RAY_HIT_BACKGROUND); } } + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/split/kernel_shader_eval.h b/intern/cycles/kernel/split/kernel_shader_eval.h index cef64bf5f36..7032461b04a 100644 --- a/intern/cycles/kernel/split/kernel_shader_eval.h +++ b/intern/cycles/kernel/split/kernel_shader_eval.h @@ -1,5 +1,5 @@ /* - * Copyright 2011-2015 Blender Foundation + * Copyright 2011-2017 Blender Foundation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,57 +14,53 @@ * limitations under the License. */ -#include "kernel_split_common.h" +CCL_NAMESPACE_BEGIN -/* Note on kernel_shader_eval kernel - * This kernel is the 5th kernel in the ray tracing logic. This is - * the 4rd kernel in path iteration. This kernel sets up the ShaderData - * structure from the values computed by the previous kernels. It also identifies - * the rays of state RAY_TO_REGENERATE and enqueues them in QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue. 
- * - * The input and output of the kernel is as follows, - * rng_coop -------------------------------------------|--- kernel_shader_eval --|--- sd - * Ray_coop -------------------------------------------| |--- Queue_data (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS) - * PathState_coop -------------------------------------| |--- Queue_index (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS) - * Intersection_coop ----------------------------------| | - * Queue_data (QUEUE_ACTIVE_AND_REGENERATD_RAYS)-------| | - * Queue_index(QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS)---| | - * ray_state ------------------------------------------| | - * kg (globals) ---------------------------------------| | - * queuesize ------------------------------------------| | - * - * Note on Queues : - * This kernel reads from the QUEUE_ACTIVE_AND_REGENERATED_RAYS queue and processes - * only the rays of state RAY_ACTIVE; - * State of queues when this kernel is called, - * at entry, - * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE and RAY_REGENERATED rays - * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be empty. - * at exit, - * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE and RAY_REGENERATED rays - * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_TO_REGENERATE rays +/* This kernel evaluates ShaderData structure from the values computed + * by the previous kernels. 
*/ -ccl_device void kernel_shader_eval( - KernelGlobals *kg, - ShaderData *sd, /* Output ShaderData structure to be filled */ - ccl_global uint *rng_coop, /* Required for rbsdf calculation */ - ccl_global Ray *Ray_coop, /* Required for setting up shader from ray */ - ccl_global PathState *PathState_coop, /* Required for all functions in this kernel */ - Intersection *Intersection_coop, /* Required for setting up shader from ray */ - ccl_global char *ray_state, /* Denotes the state of each ray */ - int ray_index) +ccl_device void kernel_shader_eval(KernelGlobals *kg) { + + int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); + /* Sorting on cuda split is not implemented */ +#ifdef __KERNEL_CUDA__ + int queue_index = kernel_split_params.queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS]; +#else + int queue_index = kernel_split_params.queue_index[QUEUE_SHADER_SORTED_RAYS]; +#endif + if(ray_index >= queue_index) { + return; + } + ray_index = get_ray_index(kg, ray_index, +#ifdef __KERNEL_CUDA__ + QUEUE_ACTIVE_AND_REGENERATED_RAYS, +#else + QUEUE_SHADER_SORTED_RAYS, +#endif + kernel_split_state.queue_data, + kernel_split_params.queue_size, + 0); + + if(ray_index == QUEUE_EMPTY_SLOT) { + return; + } + + ccl_global char *ray_state = kernel_split_state.ray_state; if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) { - Intersection *isect = &Intersection_coop[ray_index]; - ccl_global uint *rng = &rng_coop[ray_index]; - ccl_global PathState *state = &PathState_coop[ray_index]; - Ray ray = Ray_coop[ray_index]; + ccl_global PathState *state = &kernel_split_state.path_state[ray_index]; - shader_setup_from_ray(kg, - sd, - isect, - &ray); - float rbsdf = path_state_rng_1D_for_decision(kg, rng, state, PRNG_BSDF); - shader_eval_surface(kg, sd, rng, state, rbsdf, state->flag, SHADER_CONTEXT_MAIN); + shader_eval_surface(kg, &kernel_split_state.sd[ray_index], state, state->flag); +#ifdef __BRANCHED_PATH__ + if(kernel_data.integrator.branched) { + 
shader_merge_closures(&kernel_split_state.sd[ray_index]); + } + else +#endif + { + shader_prepare_closures(&kernel_split_state.sd[ray_index], state); + } } } + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/split/kernel_shader_setup.h b/intern/cycles/kernel/split/kernel_shader_setup.h new file mode 100644 index 00000000000..0432689d9fa --- /dev/null +++ b/intern/cycles/kernel/split/kernel_shader_setup.h @@ -0,0 +1,70 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +CCL_NAMESPACE_BEGIN + +/* This kernel sets up the ShaderData structure from the values computed + * by the previous kernels. + * + * It also identifies the rays of state RAY_TO_REGENERATE and enqueues them + * in QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue. + */ +ccl_device void kernel_shader_setup(KernelGlobals *kg, + ccl_local_param unsigned int *local_queue_atomics) +{ + /* Enqeueue RAY_TO_REGENERATE rays into QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue. 
*/ + if(ccl_local_id(0) == 0 && ccl_local_id(1) == 0) { + *local_queue_atomics = 0; + } + ccl_barrier(CCL_LOCAL_MEM_FENCE); + + int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); + int queue_index = kernel_split_params.queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS]; + if(ray_index >= queue_index) { + return; + } + ray_index = get_ray_index(kg, ray_index, + QUEUE_ACTIVE_AND_REGENERATED_RAYS, + kernel_split_state.queue_data, + kernel_split_params.queue_size, + 0); + + if(ray_index == QUEUE_EMPTY_SLOT) { + return; + } + + char enqueue_flag = (IS_STATE(kernel_split_state.ray_state, ray_index, RAY_TO_REGENERATE)) ? 1 : 0; + enqueue_ray_index_local(ray_index, + QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS, + enqueue_flag, + kernel_split_params.queue_size, + local_queue_atomics, + kernel_split_state.queue_data, + kernel_split_params.queue_index); + + /* Continue on with shader evaluation. */ + if(IS_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE)) { + Intersection isect = kernel_split_state.isect[ray_index]; + Ray ray = kernel_split_state.ray[ray_index]; + + shader_setup_from_ray(kg, + &kernel_split_state.sd[ray_index], + &isect, + &ray); + } +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/split/kernel_shader_sort.h b/intern/cycles/kernel/split/kernel_shader_sort.h new file mode 100644 index 00000000000..5a55b680695 --- /dev/null +++ b/intern/cycles/kernel/split/kernel_shader_sort.h @@ -0,0 +1,97 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +CCL_NAMESPACE_BEGIN + + +ccl_device void kernel_shader_sort(KernelGlobals *kg, + ccl_local_param ShaderSortLocals *locals) +{ +#ifndef __KERNEL_CUDA__ + int tid = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); + uint qsize = kernel_split_params.queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS]; + if(tid == 0) { + kernel_split_params.queue_index[QUEUE_SHADER_SORTED_RAYS] = qsize; + } + + uint offset = (tid/SHADER_SORT_LOCAL_SIZE)*SHADER_SORT_BLOCK_SIZE; + if(offset >= qsize) { + return; + } + + int lid = ccl_local_id(1) * ccl_local_size(0) + ccl_local_id(0); + uint input = QUEUE_ACTIVE_AND_REGENERATED_RAYS * (kernel_split_params.queue_size); + uint output = QUEUE_SHADER_SORTED_RAYS * (kernel_split_params.queue_size); + ccl_local uint *local_value = &locals->local_value[0]; + ccl_local ushort *local_index = &locals->local_index[0]; + + /* copy to local memory */ + for(uint i = 0; i < SHADER_SORT_BLOCK_SIZE; i += SHADER_SORT_LOCAL_SIZE) { + uint idx = offset + i + lid; + uint add = input + idx; + uint value = (~0); + if(idx < qsize) { + int ray_index = kernel_split_state.queue_data[add]; + bool valid = (ray_index != QUEUE_EMPTY_SLOT) && IS_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE); + if(valid) { + value = kernel_split_state.sd[ray_index].shader & SHADER_MASK; + } + } + local_value[i + lid] = value; + local_index[i + lid] = i + lid; + } + ccl_barrier(CCL_LOCAL_MEM_FENCE); + + /* skip sorting for cpu split kernel */ +# ifdef __KERNEL_OPENCL__ + + /* bitonic sort */ + for(uint length = 1; length < SHADER_SORT_BLOCK_SIZE; length <<= 1) { + for(uint inc = length; inc > 0; inc >>= 1) { + for(uint ii = 0; ii < SHADER_SORT_BLOCK_SIZE; ii += SHADER_SORT_LOCAL_SIZE) { + uint i = lid + ii; + bool direction = ((i & (length << 1)) != 0); + uint j = i ^ inc; + ushort ioff = local_index[i]; + ushort joff = local_index[j]; + uint iKey = 
local_value[ioff]; + uint jKey = local_value[joff]; + bool smaller = (jKey < iKey) || (jKey == iKey && j < i); + bool swap = smaller ^ (j < i) ^ direction; + ccl_barrier(CCL_LOCAL_MEM_FENCE); + local_index[i] = (swap) ? joff : ioff; + local_index[j] = (swap) ? ioff : joff; + ccl_barrier(CCL_LOCAL_MEM_FENCE); + } + } + } +# endif /* __KERNEL_OPENCL__ */ + + /* copy to destination */ + for(uint i = 0; i < SHADER_SORT_BLOCK_SIZE; i += SHADER_SORT_LOCAL_SIZE) { + uint idx = offset + i + lid; + uint lidx = local_index[i + lid]; + uint outi = output + idx; + uint ini = input + offset + lidx; + uint value = local_value[lidx]; + if(idx < qsize) { + kernel_split_state.queue_data[outi] = (value == (~0)) ? QUEUE_EMPTY_SLOT : kernel_split_state.queue_data[ini]; + } + } +#endif /* __KERNEL_CUDA__ */ +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/split/kernel_shadow_blocked.h b/intern/cycles/kernel/split/kernel_shadow_blocked.h deleted file mode 100644 index 6153af47f96..00000000000 --- a/intern/cycles/kernel/split/kernel_shadow_blocked.h +++ /dev/null @@ -1,85 +0,0 @@ -/* - * Copyright 2011-2015 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "kernel_split_common.h" - -/* Note on kernel_shadow_blocked kernel. - * This is the ninth kernel in the ray tracing logic. This is the eighth - * of the path iteration kernels. This kernel takes care of "shadow ray cast" - * logic of the direct lighting and AO part of ray tracing. 
- * - * The input and output are as follows, - * - * PathState_coop ----------------------------------|--- kernel_shadow_blocked --| - * LightRay_dl_coop --------------------------------| |--- LightRay_dl_coop - * LightRay_ao_coop --------------------------------| |--- LightRay_ao_coop - * ray_state ---------------------------------------| |--- ray_state - * Queue_data(QUEUE_SHADOW_RAY_CAST_AO_RAYS & | |--- Queue_data (QUEUE_SHADOW_RAY_CAST_AO_RAYS & QUEUE_SHADOW_RAY_CAST_AO_RAYS) - QUEUE_SHADOW_RAY_CAST_DL_RAYS) -------| | - * Queue_index(QUEUE_SHADOW_RAY_CAST_AO_RAYS& - QUEUE_SHADOW_RAY_CAST_DL_RAYS) -------| | - * kg (globals) ------------------------------------| | - * queuesize ---------------------------------------| | - * - * Note on sd_shadow : sd_shadow is neither input nor output to this kernel. sd_shadow is filled and consumed in this kernel itself. - * Note on queues : - * The kernel fetches from QUEUE_SHADOW_RAY_CAST_AO_RAYS and QUEUE_SHADOW_RAY_CAST_DL_RAYS queues. We will empty - * these queues this kernel. - * State of queues when this kernel is called : - * state of queues QUEUE_ACTIVE_AND_REGENERATED_RAYS and QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be same - * before and after this kernel call. - * QUEUE_SHADOW_RAY_CAST_AO_RAYS & QUEUE_SHADOW_RAY_CAST_DL_RAYS will be filled with rays marked with flags RAY_SHADOW_RAY_CAST_AO - * and RAY_SHADOW_RAY_CAST_DL respectively, during kernel entry. - * QUEUE_SHADOW_RAY_CAST_AO_RAYS and QUEUE_SHADOW_RAY_CAST_DL_RAYS will be empty at kernel exit. - */ -ccl_device void kernel_shadow_blocked( - KernelGlobals *kg, - ccl_global PathState *PathState_coop, /* Required for shadow blocked */ - ccl_global Ray *LightRay_dl_coop, /* Required for direct lighting's shadow blocked */ - ccl_global Ray *LightRay_ao_coop, /* Required for AO's shadow blocked */ - ccl_global char *ray_state, - char shadow_blocked_type, - int ray_index) -{ - /* Flag determining if we need to update L. 
*/ - char update_path_radiance = 0; - - if(IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL) || - IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO)) - { - ccl_global PathState *state = &PathState_coop[ray_index]; - ccl_global Ray *light_ray_dl_global = &LightRay_dl_coop[ray_index]; - ccl_global Ray *light_ray_ao_global = &LightRay_ao_coop[ray_index]; - - ccl_global Ray *light_ray_global = - shadow_blocked_type == RAY_SHADOW_RAY_CAST_AO - ? light_ray_ao_global - : light_ray_dl_global; - - float3 shadow; - update_path_radiance = !(shadow_blocked(kg, - kg->sd_input, - state, - light_ray_global, - &shadow)); - - /* We use light_ray_global's P and t to store shadow and - * update_path_radiance. - */ - light_ray_global->P = shadow; - light_ray_global->t = update_path_radiance; - } -} diff --git a/intern/cycles/kernel/split/kernel_shadow_blocked_ao.h b/intern/cycles/kernel/split/kernel_shadow_blocked_ao.h new file mode 100644 index 00000000000..79aa2c9435b --- /dev/null +++ b/intern/cycles/kernel/split/kernel_shadow_blocked_ao.h @@ -0,0 +1,55 @@ +/* + * Copyright 2011-2015 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +CCL_NAMESPACE_BEGIN + +/* Shadow ray cast for AO. 
*/ +ccl_device void kernel_shadow_blocked_ao(KernelGlobals *kg) +{ + unsigned int ao_queue_length = kernel_split_params.queue_index[QUEUE_SHADOW_RAY_CAST_AO_RAYS]; + ccl_barrier(CCL_LOCAL_MEM_FENCE); + + int ray_index = QUEUE_EMPTY_SLOT; + int thread_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); + if(thread_index < ao_queue_length) { + ray_index = get_ray_index(kg, thread_index, QUEUE_SHADOW_RAY_CAST_AO_RAYS, + kernel_split_state.queue_data, kernel_split_params.queue_size, 1); + } + + if(ray_index == QUEUE_EMPTY_SLOT) { + return; + } + + ShaderData *sd = &kernel_split_state.sd[ray_index]; + ShaderData *emission_sd = &kernel_split_state.sd_DL_shadow[ray_index]; + PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; + ccl_global PathState *state = &kernel_split_state.path_state[ray_index]; + float3 throughput = kernel_split_state.throughput[ray_index]; + +#ifdef __BRANCHED_PATH__ + if(!kernel_data.integrator.branched || IS_FLAG(kernel_split_state.ray_state, ray_index, RAY_BRANCHED_INDIRECT)) { +#endif + kernel_path_ao(kg, sd, emission_sd, L, state, throughput, shader_bsdf_alpha(kg, sd)); +#ifdef __BRANCHED_PATH__ + } + else { + kernel_branched_path_ao(kg, sd, emission_sd, L, state, throughput); + } +#endif +} + +CCL_NAMESPACE_END + diff --git a/intern/cycles/kernel/split/kernel_shadow_blocked_dl.h b/intern/cycles/kernel/split/kernel_shadow_blocked_dl.h new file mode 100644 index 00000000000..b52f9a5eb81 --- /dev/null +++ b/intern/cycles/kernel/split/kernel_shadow_blocked_dl.h @@ -0,0 +1,107 @@ +/* + * Copyright 2011-2015 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +CCL_NAMESPACE_BEGIN + +/* Shadow ray cast for direct visible light. */ +ccl_device void kernel_shadow_blocked_dl(KernelGlobals *kg) +{ + unsigned int dl_queue_length = kernel_split_params.queue_index[QUEUE_SHADOW_RAY_CAST_DL_RAYS]; + ccl_barrier(CCL_LOCAL_MEM_FENCE); + + int ray_index = QUEUE_EMPTY_SLOT; + int thread_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); + if(thread_index < dl_queue_length) { + ray_index = get_ray_index(kg, thread_index, QUEUE_SHADOW_RAY_CAST_DL_RAYS, + kernel_split_state.queue_data, kernel_split_params.queue_size, 1); + } + +#ifdef __BRANCHED_PATH__ + /* TODO(mai): move this somewhere else? */ + if(thread_index == 0) { + /* Clear QUEUE_INACTIVE_RAYS before next kernel. 
*/ + kernel_split_params.queue_index[QUEUE_INACTIVE_RAYS] = 0; + } +#endif /* __BRANCHED_PATH__ */ + + if(ray_index == QUEUE_EMPTY_SLOT) + return; + + ccl_global PathState *state = &kernel_split_state.path_state[ray_index]; + Ray ray = kernel_split_state.light_ray[ray_index]; + PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; + ShaderData *sd = &kernel_split_state.sd[ray_index]; + float3 throughput = kernel_split_state.throughput[ray_index]; + + BsdfEval L_light = kernel_split_state.bsdf_eval[ray_index]; + ShaderData *emission_sd = &kernel_split_state.sd_DL_shadow[ray_index]; + bool is_lamp = kernel_split_state.is_lamp[ray_index]; + +# if defined(__BRANCHED_PATH__) || defined(__SHADOW_TRICKS__) + bool use_branched = false; + int all = 0; + + if(state->flag & PATH_RAY_SHADOW_CATCHER) { + use_branched = true; + all = 1; + } +# if defined(__BRANCHED_PATH__) + else if(kernel_data.integrator.branched) { + use_branched = true; + + if(IS_FLAG(kernel_split_state.ray_state, ray_index, RAY_BRANCHED_INDIRECT)) { + all = (kernel_data.integrator.sample_all_lights_indirect); + } + else + { + all = (kernel_data.integrator.sample_all_lights_direct); + } + } +# endif /* __BRANCHED_PATH__ */ + + if(use_branched) { + kernel_branched_path_surface_connect_light(kg, + sd, + emission_sd, + state, + throughput, + 1.0f, + L, + all); + } + else +# endif /* defined(__BRANCHED_PATH__) || defined(__SHADOW_TRICKS__)*/ + { + /* trace shadow ray */ + float3 shadow; + + if(!shadow_blocked(kg, + sd, + emission_sd, + state, + &ray, + &shadow)) + { + /* accumulate */ + path_radiance_accum_light(L, state, throughput, &L_light, shadow, 1.0f, is_lamp); + } + else { + path_radiance_accum_total_light(L, state, throughput, &L_light); + } + } +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/split/kernel_split_common.h b/intern/cycles/kernel/split/kernel_split_common.h index 2135ee22b2e..21886ee62ee 100644 --- a/intern/cycles/kernel/split/kernel_split_common.h +++ 
b/intern/cycles/kernel/split/kernel_split_common.h @@ -17,48 +17,78 @@ #ifndef __KERNEL_SPLIT_H__ #define __KERNEL_SPLIT_H__ -#include "kernel_compat_opencl.h" -#include "kernel_math.h" -#include "kernel_types.h" -#include "kernel_globals.h" -#include "kernel_image_opencl.h" - -#include "util_atomic.h" - -#include "kernel_random.h" -#include "kernel_projection.h" -#include "kernel_montecarlo.h" -#include "kernel_differential.h" -#include "kernel_camera.h" - -#include "geom/geom.h" -#include "bvh/bvh.h" - -#include "kernel_accumulate.h" -#include "kernel_shader.h" -#include "kernel_light.h" -#include "kernel_passes.h" - -#ifdef __SUBSURFACE__ -#include "kernel_subsurface.h" +#include "kernel/kernel_math.h" +#include "kernel/kernel_types.h" + +#include "kernel/split/kernel_split_data.h" + +#include "kernel/kernel_globals.h" + +#ifdef __OSL__ +# include "kernel/osl/osl_shader.h" +#endif + +#ifdef __KERNEL_OPENCL__ +# include "kernel/kernels/opencl/kernel_opencl_image.h" +#endif +#ifdef __KERNEL_CUDA__ +# include "kernel/kernels/cuda/kernel_cuda_image.h" +#endif +#ifdef __KERNEL_CPU__ +# include "kernel/kernels/cpu/kernel_cpu_image.h" +#endif + +#include "util/util_atomic.h" + +#include "kernel/kernel_path.h" +#ifdef __BRANCHED_PATH__ +# include "kernel/kernel_path_branched.h" #endif -#ifdef __VOLUME__ -#include "kernel_volume.h" +#include "kernel/kernel_queues.h" +#include "kernel/kernel_work_stealing.h" + +#ifdef __BRANCHED_PATH__ +# include "kernel/split/kernel_branched.h" #endif -#include "kernel_path_state.h" -#include "kernel_shadow.h" -#include "kernel_emission.h" -#include "kernel_path_common.h" -#include "kernel_path_surface.h" -#include "kernel_path_volume.h" +CCL_NAMESPACE_BEGIN + +ccl_device_inline void kernel_split_path_end(KernelGlobals *kg, int ray_index) +{ + ccl_global char *ray_state = kernel_split_state.ray_state; + +#ifdef __BRANCHED_PATH__ + if(IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT_SHARED)) { + int orig_ray = 
kernel_split_state.branched_state[ray_index].original_ray; + + PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; + PathRadiance *orig_ray_L = &kernel_split_state.path_radiance[orig_ray]; + + path_radiance_sum_indirect(L); + path_radiance_accum_sample(orig_ray_L, L); + + atomic_fetch_and_dec_uint32((ccl_global uint*)&kernel_split_state.branched_state[orig_ray].shared_sample_count); -#ifdef __KERNEL_DEBUG__ -#include "kernel_debug.h" + ASSIGN_RAY_STATE(ray_state, ray_index, RAY_INACTIVE); + } + else if(IS_FLAG(ray_state, ray_index, RAY_BRANCHED_LIGHT_INDIRECT)) { + ASSIGN_RAY_STATE(ray_state, ray_index, RAY_LIGHT_INDIRECT_NEXT_ITER); + } + else if(IS_FLAG(ray_state, ray_index, RAY_BRANCHED_VOLUME_INDIRECT)) { + ASSIGN_RAY_STATE(ray_state, ray_index, RAY_VOLUME_INDIRECT_NEXT_ITER); + } + else if(IS_FLAG(ray_state, ray_index, RAY_BRANCHED_SUBSURFACE_INDIRECT)) { + ASSIGN_RAY_STATE(ray_state, ray_index, RAY_SUBSURFACE_INDIRECT_NEXT_ITER); + } + else { + ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER); + } +#else + ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER); #endif +} -#include "kernel_queues.h" -#include "kernel_work_stealing.h" +CCL_NAMESPACE_END #endif /* __KERNEL_SPLIT_H__ */ diff --git a/intern/cycles/kernel/split/kernel_split_data.h b/intern/cycles/kernel/split/kernel_split_data.h new file mode 100644 index 00000000000..eac22050a38 --- /dev/null +++ b/intern/cycles/kernel/split/kernel_split_data.h @@ -0,0 +1,57 @@ +/* + * Copyright 2011-2016 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __KERNEL_SPLIT_DATA_H__ +#define __KERNEL_SPLIT_DATA_H__ + +#include "kernel/split/kernel_split_data_types.h" +#include "kernel/kernel_globals.h" + +CCL_NAMESPACE_BEGIN + +ccl_device_inline uint64_t split_data_buffer_size(KernelGlobals *kg, size_t num_elements) +{ + (void)kg; /* Unused on CPU. */ + + uint64_t size = 0; +#define SPLIT_DATA_ENTRY(type, name, num) + align_up(num_elements * num * sizeof(type), 16) + size = size SPLIT_DATA_ENTRIES; +#undef SPLIT_DATA_ENTRY + + return size; +} + +ccl_device_inline void split_data_init(KernelGlobals *kg, + ccl_global SplitData *split_data, + size_t num_elements, + ccl_global void *data, + ccl_global char *ray_state) +{ + (void)kg; /* Unused on CPU. */ + + ccl_global char *p = (ccl_global char*)data; + +#define SPLIT_DATA_ENTRY(type, name, num) \ + split_data->name = (type*)p; p += align_up(num_elements * num * sizeof(type), 16); + SPLIT_DATA_ENTRIES; +#undef SPLIT_DATA_ENTRY + + split_data->ray_state = ray_state; +} + +CCL_NAMESPACE_END + +#endif /* __KERNEL_SPLIT_DATA_H__ */ diff --git a/intern/cycles/kernel/split/kernel_split_data_types.h b/intern/cycles/kernel/split/kernel_split_data_types.h new file mode 100644 index 00000000000..b0e6e5f5250 --- /dev/null +++ b/intern/cycles/kernel/split/kernel_split_data_types.h @@ -0,0 +1,175 @@ +/* + * Copyright 2011-2016 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __KERNEL_SPLIT_DATA_TYPES_H__ +#define __KERNEL_SPLIT_DATA_TYPES_H__ + +CCL_NAMESPACE_BEGIN + +/* parameters used by the split kernels, we use a single struct to avoid passing these to each kernel */ + +typedef struct SplitParams { + WorkTile tile; + uint total_work_size; + + ccl_global unsigned int *work_pools; + + ccl_global int *queue_index; + int queue_size; + ccl_global char *use_queues_flag; + + /* Place for storing sd->flag. AMD GPU OpenCL compiler workaround */ + int dummy_sd_flag; +} SplitParams; + +/* Global memory variables [porting]; This memory is used for + * co-operation between different kernels; Data written by one + * kernel will be available to another kernel via this global + * memory. + */ + +/* SPLIT_DATA_ENTRY(type, name, num) */ + +#ifdef __BRANCHED_PATH__ + +typedef ccl_global struct SplitBranchedState { + /* various state that must be kept and restored after an indirect loop */ + PathState path_state; + float3 throughput; + Ray ray; + + struct ShaderData sd; + Intersection isect; + + char ray_state; + + /* indirect loop state */ + int next_closure; + int next_sample; + +#ifdef __SUBSURFACE__ + int ss_next_closure; + int ss_next_sample; + int next_hit; + int num_hits; + + uint lcg_state; + SubsurfaceIntersection ss_isect; + +# ifdef __VOLUME__ + VolumeStack volume_stack[VOLUME_STACK_SIZE]; +# endif /* __VOLUME__ */ +#endif /*__SUBSURFACE__ */ + + int shared_sample_count; /* number of branched samples shared with other threads */ + int original_ray; /* index of original ray when sharing branched samples */ + bool waiting_on_shared_samples; +} SplitBranchedState; + +#define SPLIT_DATA_BRANCHED_ENTRIES \ + SPLIT_DATA_ENTRY( SplitBranchedState, branched_state, 1) +#else +#define SPLIT_DATA_BRANCHED_ENTRIES +#endif /* __BRANCHED_PATH__ */ + +#ifdef __SUBSURFACE__ +# define SPLIT_DATA_SUBSURFACE_ENTRIES \ + 
SPLIT_DATA_ENTRY(ccl_global SubsurfaceIndirectRays, ss_rays, 1) +#else +# define SPLIT_DATA_SUBSURFACE_ENTRIES +#endif /* __SUBSURFACE__ */ + +#ifdef __VOLUME__ +# define SPLIT_DATA_VOLUME_ENTRIES \ + SPLIT_DATA_ENTRY(ccl_global PathState, state_shadow, 1) +#else +# define SPLIT_DATA_VOLUME_ENTRIES +#endif /* __VOLUME__ */ + +#define SPLIT_DATA_ENTRIES \ + SPLIT_DATA_ENTRY(ccl_global float3, throughput, 1) \ + SPLIT_DATA_ENTRY(PathRadiance, path_radiance, 1) \ + SPLIT_DATA_ENTRY(ccl_global Ray, ray, 1) \ + SPLIT_DATA_ENTRY(ccl_global PathState, path_state, 1) \ + SPLIT_DATA_ENTRY(ccl_global Intersection, isect, 1) \ + SPLIT_DATA_ENTRY(ccl_global BsdfEval, bsdf_eval, 1) \ + SPLIT_DATA_ENTRY(ccl_global int, is_lamp, 1) \ + SPLIT_DATA_ENTRY(ccl_global Ray, light_ray, 1) \ + SPLIT_DATA_ENTRY(ccl_global int, queue_data, (NUM_QUEUES*2)) /* TODO(mai): this is too large? */ \ + SPLIT_DATA_ENTRY(ccl_global uint, buffer_offset, 1) \ + SPLIT_DATA_ENTRY(ShaderData, sd, 1) \ + SPLIT_DATA_ENTRY(ShaderData, sd_DL_shadow, 1) \ + SPLIT_DATA_SUBSURFACE_ENTRIES \ + SPLIT_DATA_VOLUME_ENTRIES \ + SPLIT_DATA_BRANCHED_ENTRIES \ + +/* entries to be copied to inactive rays when sharing branched samples (TODO: which are actually needed?) 
*/ +#define SPLIT_DATA_ENTRIES_BRANCHED_SHARED \ + SPLIT_DATA_ENTRY(ccl_global float3, throughput, 1) \ + SPLIT_DATA_ENTRY(PathRadiance, path_radiance, 1) \ + SPLIT_DATA_ENTRY(ccl_global Ray, ray, 1) \ + SPLIT_DATA_ENTRY(ccl_global PathState, path_state, 1) \ + SPLIT_DATA_ENTRY(ccl_global Intersection, isect, 1) \ + SPLIT_DATA_ENTRY(ccl_global BsdfEval, bsdf_eval, 1) \ + SPLIT_DATA_ENTRY(ccl_global int, is_lamp, 1) \ + SPLIT_DATA_ENTRY(ccl_global Ray, light_ray, 1) \ + SPLIT_DATA_ENTRY(ShaderData, sd, 1) \ + SPLIT_DATA_ENTRY(ShaderData, sd_DL_shadow, 1) \ + SPLIT_DATA_SUBSURFACE_ENTRIES \ + SPLIT_DATA_VOLUME_ENTRIES \ + SPLIT_DATA_BRANCHED_ENTRIES \ + +/* struct that holds pointers to data in the shared state buffer */ +typedef struct SplitData { +#define SPLIT_DATA_ENTRY(type, name, num) type *name; + SPLIT_DATA_ENTRIES +#undef SPLIT_DATA_ENTRY + + /* this is actually in a separate buffer from the rest of the split state data (so it can be read back from + * the host easily) but is still used the same as the other data so we have it here in this struct as well + */ + ccl_global char *ray_state; +} SplitData; + +#ifndef __KERNEL_CUDA__ +# define kernel_split_state (kg->split_data) +# define kernel_split_params (kg->split_param_data) +#else +__device__ SplitData __split_data; +# define kernel_split_state (__split_data) +__device__ SplitParams __split_param_data; +# define kernel_split_params (__split_param_data) +#endif /* __KERNEL_CUDA__ */ + +/* Local storage for queue_enqueue kernel. */ +typedef struct QueueEnqueueLocals { + uint queue_atomics[2]; +} QueueEnqueueLocals; + +/* Local storage for holdout_emission_blurring_pathtermination_ao kernel. 
*/ +typedef struct BackgroundAOLocals { + uint queue_atomics_bg; + uint queue_atomics_ao; +} BackgroundAOLocals; + +typedef struct ShaderSortLocals { + uint local_value[SHADER_SORT_BLOCK_SIZE]; + ushort local_index[SHADER_SORT_BLOCK_SIZE]; +} ShaderSortLocals; + +CCL_NAMESPACE_END + +#endif /* __KERNEL_SPLIT_DATA_TYPES_H__ */ diff --git a/intern/cycles/kernel/split/kernel_subsurface_scatter.h b/intern/cycles/kernel/split/kernel_subsurface_scatter.h new file mode 100644 index 00000000000..3b957856aea --- /dev/null +++ b/intern/cycles/kernel/split/kernel_subsurface_scatter.h @@ -0,0 +1,313 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +CCL_NAMESPACE_BEGIN + +#if defined(__BRANCHED_PATH__) && defined(__SUBSURFACE__) + +ccl_device_inline void kernel_split_branched_path_subsurface_indirect_light_init(KernelGlobals *kg, int ray_index) +{ + kernel_split_branched_path_indirect_loop_init(kg, ray_index); + + SplitBranchedState *branched_state = &kernel_split_state.branched_state[ray_index]; + + branched_state->ss_next_closure = 0; + branched_state->ss_next_sample = 0; + + branched_state->num_hits = 0; + branched_state->next_hit = 0; + + ADD_RAY_FLAG(kernel_split_state.ray_state, ray_index, RAY_BRANCHED_SUBSURFACE_INDIRECT); +} + +ccl_device_noinline bool kernel_split_branched_path_subsurface_indirect_light_iter(KernelGlobals *kg, int ray_index) +{ + SplitBranchedState *branched_state = &kernel_split_state.branched_state[ray_index]; + + ShaderData *sd = &branched_state->sd; + PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; + ShaderData *emission_sd = &kernel_split_state.sd_DL_shadow[ray_index]; + + for(int i = branched_state->ss_next_closure; i < sd->num_closure; i++) { + ShaderClosure *sc = &sd->closure[i]; + + if(!CLOSURE_IS_BSSRDF(sc->type)) + continue; + + /* set up random number generator */ + if(branched_state->ss_next_sample == 0 && branched_state->next_hit == 0 && + branched_state->next_closure == 0 && branched_state->next_sample == 0) + { + branched_state->lcg_state = lcg_state_init_addrspace(&branched_state->path_state, + 0x68bc21eb); + } + int num_samples = kernel_data.integrator.subsurface_samples; + float num_samples_inv = 1.0f/num_samples; + uint bssrdf_rng_hash = cmj_hash(branched_state->path_state.rng_hash, i); + + /* do subsurface scatter step with copy of shader data, this will + * replace the BSSRDF with a diffuse BSDF closure */ + for(int j = branched_state->ss_next_sample; j < num_samples; j++) { + ccl_global SubsurfaceIntersection *ss_isect = &branched_state->ss_isect; + float bssrdf_u, bssrdf_v; + path_branched_rng_2D(kg, + bssrdf_rng_hash, + 
&branched_state->path_state, + j, + num_samples, + PRNG_BSDF_U, + &bssrdf_u, + &bssrdf_v); + + /* intersection is expensive so avoid doing multiple times for the same input */ + if(branched_state->next_hit == 0 && branched_state->next_closure == 0 && branched_state->next_sample == 0) { + uint lcg_state = branched_state->lcg_state; + SubsurfaceIntersection ss_isect_private; + + branched_state->num_hits = subsurface_scatter_multi_intersect(kg, + &ss_isect_private, + sd, + sc, + &lcg_state, + bssrdf_u, bssrdf_v, + true); + + branched_state->lcg_state = lcg_state; + *ss_isect = ss_isect_private; + } + +#ifdef __VOLUME__ + Ray volume_ray = branched_state->ray; + bool need_update_volume_stack = + kernel_data.integrator.use_volumes && + sd->object_flag & SD_OBJECT_INTERSECTS_VOLUME; +#endif /* __VOLUME__ */ + + /* compute lighting with the BSDF closure */ + for(int hit = branched_state->next_hit; hit < branched_state->num_hits; hit++) { + ShaderData *bssrdf_sd = &kernel_split_state.sd[ray_index]; + *bssrdf_sd = *sd; /* note: copy happens each iteration of inner loop, this is + * important as the indirect path will write into bssrdf_sd */ + + SubsurfaceIntersection ss_isect_private = *ss_isect; + subsurface_scatter_multi_setup(kg, + &ss_isect_private, + hit, + bssrdf_sd, + &branched_state->path_state, + branched_state->path_state.flag, + sc, + true); + *ss_isect = ss_isect_private; + + ccl_global PathState *hit_state = &kernel_split_state.path_state[ray_index]; + *hit_state = branched_state->path_state; + + path_state_branch(hit_state, j, num_samples); + +#ifdef __VOLUME__ + if(need_update_volume_stack) { + /* Setup ray from previous surface point to the new one. 
*/ + float3 P = ray_offset(bssrdf_sd->P, -bssrdf_sd->Ng); + volume_ray.D = normalize_len(P - volume_ray.P, &volume_ray.t); + + /* this next part is expensive as it does scene intersection so only do once */ + if(branched_state->next_closure == 0 && branched_state->next_sample == 0) { + for(int k = 0; k < VOLUME_STACK_SIZE; k++) { + branched_state->volume_stack[k] = hit_state->volume_stack[k]; + } + + kernel_volume_stack_update_for_subsurface(kg, + emission_sd, + &volume_ray, + branched_state->volume_stack); + } + + for(int k = 0; k < VOLUME_STACK_SIZE; k++) { + hit_state->volume_stack[k] = branched_state->volume_stack[k]; + } + } +#endif /* __VOLUME__ */ + +#ifdef __EMISSION__ + if(branched_state->next_closure == 0 && branched_state->next_sample == 0) { + /* direct light */ + if(kernel_data.integrator.use_direct_light) { + int all = (kernel_data.integrator.sample_all_lights_direct) || + (branched_state->path_state.flag & PATH_RAY_SHADOW_CATCHER); + kernel_branched_path_surface_connect_light(kg, + bssrdf_sd, + emission_sd, + hit_state, + branched_state->throughput, + num_samples_inv, + L, + all); + } + } +#endif /* __EMISSION__ */ + + /* indirect light */ + if(kernel_split_branched_path_surface_indirect_light_iter(kg, + ray_index, + num_samples_inv, + bssrdf_sd, + false, + false)) + { + branched_state->ss_next_closure = i; + branched_state->ss_next_sample = j; + branched_state->next_hit = hit; + + return true; + } + + branched_state->next_closure = 0; + } + + branched_state->next_hit = 0; + } + + branched_state->ss_next_sample = 0; + } + + branched_state->ss_next_closure = sd->num_closure; + + branched_state->waiting_on_shared_samples = (branched_state->shared_sample_count > 0); + if(branched_state->waiting_on_shared_samples) { + return true; + } + + kernel_split_branched_path_indirect_loop_end(kg, ray_index); + + return false; +} + +#endif /* __BRANCHED_PATH__ && __SUBSURFACE__ */ + +ccl_device void kernel_subsurface_scatter(KernelGlobals *kg) +{ + int thread_index 
= ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); + if(thread_index == 0) { + /* We will empty both queues in this kernel. */ + kernel_split_params.queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS] = 0; + kernel_split_params.queue_index[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS] = 0; + } + + int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); + ray_index = get_ray_index(kg, ray_index, + QUEUE_ACTIVE_AND_REGENERATED_RAYS, + kernel_split_state.queue_data, + kernel_split_params.queue_size, + 1); + get_ray_index(kg, thread_index, + QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS, + kernel_split_state.queue_data, + kernel_split_params.queue_size, + 1); + +#ifdef __SUBSURFACE__ + ccl_global char *ray_state = kernel_split_state.ray_state; + + if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) { + ccl_global PathState *state = &kernel_split_state.path_state[ray_index]; + PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; + ccl_global Ray *ray = &kernel_split_state.ray[ray_index]; + ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index]; + ccl_global SubsurfaceIndirectRays *ss_indirect = &kernel_split_state.ss_rays[ray_index]; + ShaderData *sd = &kernel_split_state.sd[ray_index]; + ShaderData *emission_sd = &kernel_split_state.sd_DL_shadow[ray_index]; + + if(sd->flag & SD_BSSRDF) { + +#ifdef __BRANCHED_PATH__ + if(!kernel_data.integrator.branched) { +#endif + if(kernel_path_subsurface_scatter(kg, + sd, + emission_sd, + L, + state, + ray, + throughput, + ss_indirect)) + { + kernel_split_path_end(kg, ray_index); + } +#ifdef __BRANCHED_PATH__ + } + else if(IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT)) { + float bssrdf_u, bssrdf_v; + path_state_rng_2D(kg, + state, + PRNG_BSDF_U, + &bssrdf_u, &bssrdf_v); + + const ShaderClosure *sc = shader_bssrdf_pick(sd, throughput, &bssrdf_u); + + /* do bssrdf scatter step if we picked a bssrdf closure */ + if(sc) { + uint lcg_state = lcg_state_init_addrspace(state, 0x68bc21eb); + 
subsurface_scatter_step(kg, + sd, + state, + state->flag, + sc, + &lcg_state, + bssrdf_u, bssrdf_v, + false); + } + } + else { + kernel_split_branched_path_subsurface_indirect_light_init(kg, ray_index); + + if(kernel_split_branched_path_subsurface_indirect_light_iter(kg, ray_index)) { + ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED); + } + } +#endif + } + } + +# ifdef __BRANCHED_PATH__ + if(ccl_global_id(0) == 0 && ccl_global_id(1) == 0) { + kernel_split_params.queue_index[QUEUE_SUBSURFACE_INDIRECT_ITER] = 0; + } + + /* iter loop */ + ray_index = get_ray_index(kg, ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0), + QUEUE_SUBSURFACE_INDIRECT_ITER, + kernel_split_state.queue_data, + kernel_split_params.queue_size, + 1); + + if(IS_STATE(ray_state, ray_index, RAY_SUBSURFACE_INDIRECT_NEXT_ITER)) { + /* for render passes, sum and reset indirect light pass variables + * for the next samples */ + path_radiance_sum_indirect(&kernel_split_state.path_radiance[ray_index]); + path_radiance_reset_indirect(&kernel_split_state.path_radiance[ray_index]); + + if(kernel_split_branched_path_subsurface_indirect_light_iter(kg, ray_index)) { + ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED); + } + } +# endif /* __BRANCHED_PATH__ */ + +#endif /* __SUBSURFACE__ */ + +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/split/kernel_sum_all_radiance.h b/intern/cycles/kernel/split/kernel_sum_all_radiance.h deleted file mode 100644 index a21e9b6a0b1..00000000000 --- a/intern/cycles/kernel/split/kernel_sum_all_radiance.h +++ /dev/null @@ -1,59 +0,0 @@ -/* - * Copyright 2011-2015 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "../kernel_compat_opencl.h" -#include "../kernel_math.h" -#include "../kernel_types.h" -#include "../kernel_globals.h" - -/* Since we process various samples in parallel; The output radiance of different samples - * are stored in different locations; This kernel combines the output radiance contributed - * by all different samples and stores them in the RenderTile's output buffer. - */ -ccl_device void kernel_sum_all_radiance( - ccl_constant KernelData *data, /* To get pass_stride to offet into buffer */ - ccl_global float *buffer, /* Output buffer of RenderTile */ - ccl_global float *per_sample_output_buffer, /* Radiance contributed by all samples */ - int parallel_samples, int sw, int sh, int stride, - int buffer_offset_x, - int buffer_offset_y, - int buffer_stride, - int start_sample) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if(x < sw && y < sh) { - buffer += ((buffer_offset_x + x) + (buffer_offset_y + y) * buffer_stride) * (data->film.pass_stride); - per_sample_output_buffer += ((x + y * stride) * parallel_samples) * (data->film.pass_stride); - - int sample_stride = (data->film.pass_stride); - - int sample_iterator = 0; - int pass_stride_iterator = 0; - int num_floats = data->film.pass_stride; - - for(sample_iterator = 0; sample_iterator < parallel_samples; sample_iterator++) { - for(pass_stride_iterator = 0; pass_stride_iterator < num_floats; pass_stride_iterator++) { - *(buffer + pass_stride_iterator) = - (start_sample == 0 && sample_iterator == 0) - ? 
*(per_sample_output_buffer + pass_stride_iterator) - : *(buffer + pass_stride_iterator) + *(per_sample_output_buffer + pass_stride_iterator); - } - per_sample_output_buffer += sample_stride; - } - } -} diff --git a/intern/cycles/kernel/svm/svm.h b/intern/cycles/kernel/svm/svm.h index 88ec7fe6fcc..d748e76fa80 100644 --- a/intern/cycles/kernel/svm/svm.h +++ b/intern/cycles/kernel/svm/svm.h @@ -39,7 +39,7 @@ * mostly taken care of in the SVM compiler. */ -#include "svm_types.h" +#include "kernel/svm/svm_types.h" CCL_NAMESPACE_BEGIN @@ -139,49 +139,49 @@ CCL_NAMESPACE_END /* Nodes */ -#include "svm_noise.h" +#include "kernel/svm/svm_noise.h" #include "svm_texture.h" -#include "svm_color_util.h" -#include "svm_math_util.h" - -#include "svm_attribute.h" -#include "svm_gradient.h" -#include "svm_blackbody.h" -#include "svm_closure.h" -#include "svm_noisetex.h" -#include "svm_convert.h" -#include "svm_displace.h" -#include "svm_fresnel.h" -#include "svm_wireframe.h" -#include "svm_wavelength.h" -#include "svm_camera.h" -#include "svm_geometry.h" -#include "svm_hsv.h" -#include "svm_image.h" -#include "svm_gamma.h" -#include "svm_brightness.h" -#include "svm_invert.h" -#include "svm_light_path.h" -#include "svm_magic.h" -#include "svm_mapping.h" -#include "svm_normal.h" -#include "svm_wave.h" -#include "svm_math.h" -#include "svm_mix.h" -#include "svm_ramp.h" -#include "svm_sepcomb_hsv.h" -#include "svm_sepcomb_vector.h" -#include "svm_musgrave.h" -#include "svm_sky.h" -#include "svm_tex_coord.h" -#include "svm_value.h" -#include "svm_voronoi.h" -#include "svm_checker.h" -#include "svm_brick.h" -#include "svm_vector_transform.h" -#include "svm_voxel.h" -#include "svm_bump.h" +#include "kernel/svm/svm_color_util.h" +#include "kernel/svm/svm_math_util.h" + +#include "kernel/svm/svm_attribute.h" +#include "kernel/svm/svm_gradient.h" +#include "kernel/svm/svm_blackbody.h" +#include "kernel/svm/svm_closure.h" +#include "kernel/svm/svm_noisetex.h" +#include 
"kernel/svm/svm_convert.h" +#include "kernel/svm/svm_displace.h" +#include "kernel/svm/svm_fresnel.h" +#include "kernel/svm/svm_wireframe.h" +#include "kernel/svm/svm_wavelength.h" +#include "kernel/svm/svm_camera.h" +#include "kernel/svm/svm_geometry.h" +#include "kernel/svm/svm_hsv.h" +#include "kernel/svm/svm_image.h" +#include "kernel/svm/svm_gamma.h" +#include "kernel/svm/svm_brightness.h" +#include "kernel/svm/svm_invert.h" +#include "kernel/svm/svm_light_path.h" +#include "kernel/svm/svm_magic.h" +#include "kernel/svm/svm_mapping.h" +#include "kernel/svm/svm_normal.h" +#include "kernel/svm/svm_wave.h" +#include "kernel/svm/svm_math.h" +#include "kernel/svm/svm_mix.h" +#include "kernel/svm/svm_ramp.h" +#include "kernel/svm/svm_sepcomb_hsv.h" +#include "kernel/svm/svm_sepcomb_vector.h" +#include "kernel/svm/svm_musgrave.h" +#include "kernel/svm/svm_sky.h" +#include "kernel/svm/svm_tex_coord.h" +#include "kernel/svm/svm_value.h" +#include "kernel/svm/svm_voronoi.h" +#include "kernel/svm/svm_checker.h" +#include "kernel/svm/svm_brick.h" +#include "kernel/svm/svm_vector_transform.h" +#include "kernel/svm/svm_voxel.h" +#include "kernel/svm/svm_bump.h" CCL_NAMESPACE_BEGIN @@ -192,7 +192,7 @@ CCL_NAMESPACE_BEGIN ccl_device_noinline void svm_eval_nodes(KernelGlobals *kg, ShaderData *sd, ccl_addr_space PathState *state, ShaderType type, int path_flag) { float stack[SVM_STACK_SIZE]; - int offset = ccl_fetch(sd, shader) & SHADER_MASK; + int offset = sd->shader & SHADER_MASK; while(1) { uint4 node = read_node(kg, &offset); diff --git a/intern/cycles/kernel/svm/svm_attribute.h b/intern/cycles/kernel/svm/svm_attribute.h index 0e55c99ae97..229a3f20421 100644 --- a/intern/cycles/kernel/svm/svm_attribute.h +++ b/intern/cycles/kernel/svm/svm_attribute.h @@ -27,7 +27,7 @@ ccl_device AttributeDescriptor svm_node_attr_init(KernelGlobals *kg, ShaderData AttributeDescriptor desc; - if(ccl_fetch(sd, object) != OBJECT_NONE) { + if(sd->object != OBJECT_NONE) { desc = 
find_attribute(kg, sd, node.y); if(desc.offset == ATTR_STD_NOT_FOUND) { desc = attribute_not_found(); diff --git a/intern/cycles/kernel/svm/svm_blackbody.h b/intern/cycles/kernel/svm/svm_blackbody.h index b750ad87b7f..51590b18505 100644 --- a/intern/cycles/kernel/svm/svm_blackbody.h +++ b/intern/cycles/kernel/svm/svm_blackbody.h @@ -41,8 +41,7 @@ ccl_device void svm_node_blackbody(KernelGlobals *kg, ShaderData *sd, float *sta float3 color_rgb = svm_math_blackbody_color(temperature); - if(stack_valid(col_offset)) - stack_store_float3(stack, col_offset, color_rgb); + stack_store_float3(stack, col_offset, color_rgb); } CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/svm/svm_bump.h b/intern/cycles/kernel/svm/svm_bump.h index 04a8c7b64e5..610d9af9e1f 100644 --- a/intern/cycles/kernel/svm/svm_bump.h +++ b/intern/cycles/kernel/svm/svm_bump.h @@ -21,9 +21,9 @@ CCL_NAMESPACE_BEGIN ccl_device void svm_node_enter_bump_eval(KernelGlobals *kg, ShaderData *sd, float *stack, uint offset) { /* save state */ - stack_store_float3(stack, offset+0, ccl_fetch(sd, P)); - stack_store_float3(stack, offset+3, ccl_fetch(sd, dP).dx); - stack_store_float3(stack, offset+6, ccl_fetch(sd, dP).dy); + stack_store_float3(stack, offset+0, sd->P); + stack_store_float3(stack, offset+3, sd->dP.dx); + stack_store_float3(stack, offset+6, sd->dP.dy); /* set state as if undisplaced */ const AttributeDescriptor desc = find_attribute(kg, sd, ATTR_STD_POSITION_UNDISPLACED); @@ -36,18 +36,18 @@ ccl_device void svm_node_enter_bump_eval(KernelGlobals *kg, ShaderData *sd, floa object_dir_transform(kg, sd, &dPdx); object_dir_transform(kg, sd, &dPdy); - ccl_fetch(sd, P) = P; - ccl_fetch(sd, dP).dx = dPdx; - ccl_fetch(sd, dP).dy = dPdy; + sd->P = P; + sd->dP.dx = dPdx; + sd->dP.dy = dPdy; } } ccl_device void svm_node_leave_bump_eval(KernelGlobals *kg, ShaderData *sd, float *stack, uint offset) { /* restore state */ - ccl_fetch(sd, P) = stack_load_float3(stack, offset+0); - ccl_fetch(sd, dP).dx = 
stack_load_float3(stack, offset+3); - ccl_fetch(sd, dP).dy = stack_load_float3(stack, offset+6); + sd->P = stack_load_float3(stack, offset+0); + sd->dP.dx = stack_load_float3(stack, offset+3); + sd->dP.dy = stack_load_float3(stack, offset+6); } CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/svm/svm_camera.h b/intern/cycles/kernel/svm/svm_camera.h index 00678a49d70..90249dfd978 100644 --- a/intern/cycles/kernel/svm/svm_camera.h +++ b/intern/cycles/kernel/svm/svm_camera.h @@ -23,7 +23,7 @@ ccl_device void svm_node_camera(KernelGlobals *kg, ShaderData *sd, float *stack, float3 vector; Transform tfm = kernel_data.cam.worldtocamera; - vector = transform_point(&tfm, ccl_fetch(sd, P)); + vector = transform_point(&tfm, sd->P); zdepth = vector.z; distance = len(vector); diff --git a/intern/cycles/kernel/svm/svm_closure.h b/intern/cycles/kernel/svm/svm_closure.h index 017d697f9f8..4268813b263 100644 --- a/intern/cycles/kernel/svm/svm_closure.h +++ b/intern/cycles/kernel/svm/svm_closure.h @@ -25,13 +25,13 @@ ccl_device void svm_node_glass_setup(ShaderData *sd, MicrofacetBsdf *bsdf, int t bsdf->alpha_y = 0.0f; bsdf->alpha_x = 0.0f; bsdf->ior = eta; - ccl_fetch(sd, flag) |= bsdf_refraction_setup(bsdf); + sd->flag |= bsdf_refraction_setup(bsdf); } else { bsdf->alpha_y = 0.0f; bsdf->alpha_x = 0.0f; bsdf->ior = 0.0f; - ccl_fetch(sd, flag) |= bsdf_reflection_setup(bsdf); + sd->flag |= bsdf_reflection_setup(bsdf); } } else if(type == CLOSURE_BSDF_MICROFACET_BECKMANN_GLASS_ID) { @@ -40,9 +40,9 @@ ccl_device void svm_node_glass_setup(ShaderData *sd, MicrofacetBsdf *bsdf, int t bsdf->ior = eta; if(refract) - ccl_fetch(sd, flag) |= bsdf_microfacet_beckmann_refraction_setup(bsdf); + sd->flag |= bsdf_microfacet_beckmann_refraction_setup(bsdf); else - ccl_fetch(sd, flag) |= bsdf_microfacet_beckmann_setup(bsdf); + sd->flag |= bsdf_microfacet_beckmann_setup(bsdf); } else { bsdf->alpha_x = roughness; @@ -50,9 +50,9 @@ ccl_device void svm_node_glass_setup(ShaderData *sd, MicrofacetBsdf 
*bsdf, int t bsdf->ior = eta; if(refract) - ccl_fetch(sd, flag) |= bsdf_microfacet_ggx_refraction_setup(bsdf); + sd->flag |= bsdf_microfacet_ggx_refraction_setup(bsdf); else - ccl_fetch(sd, flag) |= bsdf_microfacet_ggx_setup(bsdf); + sd->flag |= bsdf_microfacet_ggx_setup(bsdf); } } @@ -70,14 +70,353 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * if(mix_weight == 0.0f) return; - float3 N = stack_valid(data_node.x)? stack_load_float3(stack, data_node.x): ccl_fetch(sd, N); + float3 N = stack_valid(data_node.x)? stack_load_float3(stack, data_node.x): sd->N; float param1 = (stack_valid(param1_offset))? stack_load_float(stack, param1_offset): __uint_as_float(node.z); float param2 = (stack_valid(param2_offset))? stack_load_float(stack, param2_offset): __uint_as_float(node.w); switch(type) { +#ifdef __PRINCIPLED__ + case CLOSURE_BSDF_PRINCIPLED_ID: { + uint specular_offset, roughness_offset, specular_tint_offset, anisotropic_offset, sheen_offset, + sheen_tint_offset, clearcoat_offset, clearcoat_roughness_offset, eta_offset, transmission_offset, + anisotropic_rotation_offset, transmission_roughness_offset; + uint4 data_node2 = read_node(kg, offset); + + float3 T = stack_load_float3(stack, data_node.y); + decode_node_uchar4(data_node.z, &specular_offset, &roughness_offset, &specular_tint_offset, &anisotropic_offset); + decode_node_uchar4(data_node.w, &sheen_offset, &sheen_tint_offset, &clearcoat_offset, &clearcoat_roughness_offset); + decode_node_uchar4(data_node2.x, &eta_offset, &transmission_offset, &anisotropic_rotation_offset, &transmission_roughness_offset); + + // get Disney principled parameters + float metallic = param1; + float subsurface = param2; + float specular = stack_load_float(stack, specular_offset); + float roughness = stack_load_float(stack, roughness_offset); + float specular_tint = stack_load_float(stack, specular_tint_offset); + float anisotropic = stack_load_float(stack, anisotropic_offset); + float sheen = 
stack_load_float(stack, sheen_offset); + float sheen_tint = stack_load_float(stack, sheen_tint_offset); + float clearcoat = stack_load_float(stack, clearcoat_offset); + float clearcoat_roughness = stack_load_float(stack, clearcoat_roughness_offset); + float transmission = stack_load_float(stack, transmission_offset); + float anisotropic_rotation = stack_load_float(stack, anisotropic_rotation_offset); + float transmission_roughness = stack_load_float(stack, transmission_roughness_offset); + float eta = fmaxf(stack_load_float(stack, eta_offset), 1e-5f); + + ClosureType distribution = stack_valid(data_node2.y) ? (ClosureType) data_node2.y : CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID; + + /* rotate tangent */ + if(anisotropic_rotation != 0.0f) + T = rotate_around_axis(T, N, anisotropic_rotation * M_2PI_F); + + /* calculate ior */ + float ior = (sd->flag & SD_BACKFACING) ? 1.0f / eta : eta; + + // calculate fresnel for refraction + float cosNO = dot(N, sd->I); + float fresnel = fresnel_dielectric_cos(cosNO, ior); + + // calculate weights of the diffuse and specular part + float diffuse_weight = (1.0f - saturate(metallic)) * (1.0f - saturate(transmission)); + + float final_transmission = saturate(transmission) * (1.0f - saturate(metallic)); + float specular_weight = (1.0f - final_transmission); + + // get the base color + uint4 data_base_color = read_node(kg, offset); + float3 base_color = stack_valid(data_base_color.x) ? stack_load_float3(stack, data_base_color.x) : + make_float3(__uint_as_float(data_base_color.y), __uint_as_float(data_base_color.z), __uint_as_float(data_base_color.w)); + + // get the additional clearcoat normal and subsurface scattering radius + uint4 data_cn_ssr = read_node(kg, offset); + float3 clearcoat_normal = stack_valid(data_cn_ssr.x) ? stack_load_float3(stack, data_cn_ssr.x) : sd->N; + float3 subsurface_radius = stack_valid(data_cn_ssr.y) ? 
stack_load_float3(stack, data_cn_ssr.y) : make_float3(1.0f, 1.0f, 1.0f); + + // get the subsurface color + uint4 data_subsurface_color = read_node(kg, offset); + float3 subsurface_color = stack_valid(data_subsurface_color.x) ? stack_load_float3(stack, data_subsurface_color.x) : + make_float3(__uint_as_float(data_subsurface_color.y), __uint_as_float(data_subsurface_color.z), __uint_as_float(data_subsurface_color.w)); + + float3 weight = sd->svm_closure_weight * mix_weight; + +#ifdef __SUBSURFACE__ + float3 mixed_ss_base_color = subsurface_color * subsurface + base_color * (1.0f - subsurface); + float3 subsurf_weight = weight * mixed_ss_base_color * diffuse_weight; + float subsurf_sample_weight = fabsf(average(subsurf_weight)); + + /* disable in case of diffuse ancestor, can't see it well then and + * adds considerably noise due to probabilities of continuing path + * getting lower and lower */ + if(path_flag & PATH_RAY_DIFFUSE_ANCESTOR) { + subsurface = 0.0f; + + /* need to set the base color in this case such that the + * rays get the correctly mixed color after transmitting + * the object */ + base_color = mixed_ss_base_color; + } + + /* diffuse */ + if(fabsf(average(mixed_ss_base_color)) > CLOSURE_WEIGHT_CUTOFF) { + if(subsurface <= CLOSURE_WEIGHT_CUTOFF && diffuse_weight > CLOSURE_WEIGHT_CUTOFF) { + float3 diff_weight = weight * base_color * diffuse_weight; + + PrincipledDiffuseBsdf *bsdf = (PrincipledDiffuseBsdf*)bsdf_alloc(sd, sizeof(PrincipledDiffuseBsdf), diff_weight); + + if(bsdf) { + bsdf->N = N; + bsdf->roughness = roughness; + + /* setup bsdf */ + sd->flag |= bsdf_principled_diffuse_setup(bsdf); + } + } + else if(subsurface > CLOSURE_WEIGHT_CUTOFF && subsurf_sample_weight > CLOSURE_WEIGHT_CUTOFF) { + /* radius * scale */ + float3 radius = subsurface_radius * subsurface; + /* sharpness */ + float sharpness = 0.0f; + /* texture color blur */ + float texture_blur = 0.0f; + + /* create one closure per color channel */ + Bssrdf *bssrdf = bssrdf_alloc(sd, 
make_float3(subsurf_weight.x, 0.0f, 0.0f)); + if(bssrdf) { + bssrdf->sample_weight = subsurf_sample_weight; + bssrdf->radius = radius.x; + bssrdf->texture_blur = texture_blur; + bssrdf->albedo = subsurface_color.x; + bssrdf->sharpness = sharpness; + bssrdf->N = N; + bssrdf->roughness = roughness; + + /* setup bsdf */ + sd->flag |= bssrdf_setup(bssrdf, (ClosureType)CLOSURE_BSSRDF_PRINCIPLED_ID); + } + + bssrdf = bssrdf_alloc(sd, make_float3(0.0f, subsurf_weight.y, 0.0f)); + if(bssrdf) { + bssrdf->sample_weight = subsurf_sample_weight; + bssrdf->radius = radius.y; + bssrdf->texture_blur = texture_blur; + bssrdf->albedo = subsurface_color.y; + bssrdf->sharpness = sharpness; + bssrdf->N = N; + bssrdf->roughness = roughness; + + /* setup bsdf */ + sd->flag |= bssrdf_setup(bssrdf, (ClosureType)CLOSURE_BSSRDF_PRINCIPLED_ID); + } + + bssrdf = bssrdf_alloc(sd, make_float3(0.0f, 0.0f, subsurf_weight.z)); + if(bssrdf) { + bssrdf->sample_weight = subsurf_sample_weight; + bssrdf->radius = radius.z; + bssrdf->texture_blur = texture_blur; + bssrdf->albedo = subsurface_color.z; + bssrdf->sharpness = sharpness; + bssrdf->N = N; + bssrdf->roughness = roughness; + + /* setup bsdf */ + sd->flag |= bssrdf_setup(bssrdf, (ClosureType)CLOSURE_BSSRDF_PRINCIPLED_ID); + } + } + } +#else + /* diffuse */ + if(diffuse_weight > CLOSURE_WEIGHT_CUTOFF) { + float3 diff_weight = weight * base_color * diffuse_weight; + + PrincipledDiffuseBsdf *bsdf = (PrincipledDiffuseBsdf*)bsdf_alloc(sd, sizeof(PrincipledDiffuseBsdf), diff_weight); + + if(bsdf) { + bsdf->N = N; + bsdf->roughness = roughness; + + /* setup bsdf */ + sd->flag |= bsdf_principled_diffuse_setup(bsdf); + } + } +#endif + + /* sheen */ + if(diffuse_weight > CLOSURE_WEIGHT_CUTOFF && sheen > CLOSURE_WEIGHT_CUTOFF) { + float m_cdlum = linear_rgb_to_gray(base_color); + float3 m_ctint = m_cdlum > 0.0f ? base_color / m_cdlum : make_float3(1.0f, 1.0f, 1.0f); // normalize lum. 
to isolate hue+sat + + /* color of the sheen component */ + float3 sheen_color = make_float3(1.0f, 1.0f, 1.0f) * (1.0f - sheen_tint) + m_ctint * sheen_tint; + + float3 sheen_weight = weight * sheen * sheen_color * diffuse_weight; + + PrincipledSheenBsdf *bsdf = (PrincipledSheenBsdf*)bsdf_alloc(sd, sizeof(PrincipledSheenBsdf), sheen_weight); + + if(bsdf) { + bsdf->N = N; + + /* setup bsdf */ + sd->flag |= bsdf_principled_sheen_setup(bsdf); + } + } + + /* specular reflection */ +#ifdef __CAUSTICS_TRICKS__ + if(kernel_data.integrator.caustics_reflective || (path_flag & PATH_RAY_DIFFUSE) == 0) { +#endif + if(specular_weight > CLOSURE_WEIGHT_CUTOFF && (specular > CLOSURE_WEIGHT_CUTOFF || metallic > CLOSURE_WEIGHT_CUTOFF)) { + float3 spec_weight = weight * specular_weight; + + MicrofacetBsdf *bsdf = (MicrofacetBsdf*)bsdf_alloc(sd, sizeof(MicrofacetBsdf), spec_weight); + MicrofacetExtra *extra = (MicrofacetExtra*)closure_alloc_extra(sd, sizeof(MicrofacetExtra)); + + if(bsdf && extra) { + bsdf->N = N; + bsdf->ior = (2.0f / (1.0f - safe_sqrtf(0.08f * specular))) - 1.0f; + bsdf->T = T; + bsdf->extra = extra; + + float aspect = safe_sqrtf(1.0f - anisotropic * 0.9f); + float r2 = roughness * roughness; + + bsdf->alpha_x = r2 / aspect; + bsdf->alpha_y = r2 * aspect; + + float m_cdlum = 0.3f * base_color.x + 0.6f * base_color.y + 0.1f * base_color.z; // luminance approx. + float3 m_ctint = m_cdlum > 0.0f ? base_color / m_cdlum : make_float3(0.0f, 0.0f, 0.0f); // normalize lum. 
to isolate hue+sat + float3 tmp_col = make_float3(1.0f, 1.0f, 1.0f) * (1.0f - specular_tint) + m_ctint * specular_tint; + + bsdf->extra->cspec0 = (specular * 0.08f * tmp_col) * (1.0f - metallic) + base_color * metallic; + bsdf->extra->color = base_color; + + /* setup bsdf */ + if(distribution == CLOSURE_BSDF_MICROFACET_GGX_GLASS_ID || roughness <= 0.075f) /* use single-scatter GGX */ + sd->flag |= bsdf_microfacet_ggx_aniso_fresnel_setup(bsdf, sd); + else /* use multi-scatter GGX */ + sd->flag |= bsdf_microfacet_multi_ggx_aniso_fresnel_setup(bsdf, sd); + } + } +#ifdef __CAUSTICS_TRICKS__ + } +#endif + + /* BSDF */ +#ifdef __CAUSTICS_TRICKS__ + if(kernel_data.integrator.caustics_reflective || kernel_data.integrator.caustics_refractive || (path_flag & PATH_RAY_DIFFUSE) == 0) { +#endif + if(final_transmission > CLOSURE_WEIGHT_CUTOFF) { + float3 glass_weight = weight * final_transmission; + float3 cspec0 = base_color * specular_tint + make_float3(1.0f, 1.0f, 1.0f) * (1.0f - specular_tint); + + if(roughness <= 5e-2f || distribution == CLOSURE_BSDF_MICROFACET_GGX_GLASS_ID) { /* use single-scatter GGX */ + float refl_roughness = roughness; + + /* reflection */ +#ifdef __CAUSTICS_TRICKS__ + if(kernel_data.integrator.caustics_reflective || (path_flag & PATH_RAY_DIFFUSE) == 0) +#endif + { + MicrofacetBsdf *bsdf = (MicrofacetBsdf*)bsdf_alloc(sd, sizeof(MicrofacetBsdf), glass_weight*fresnel); + MicrofacetExtra *extra = (MicrofacetExtra*)closure_alloc_extra(sd, sizeof(MicrofacetExtra)); + + if(bsdf && extra) { + bsdf->N = N; + bsdf->extra = extra; + + bsdf->alpha_x = refl_roughness * refl_roughness; + bsdf->alpha_y = refl_roughness * refl_roughness; + bsdf->ior = ior; + + bsdf->extra->color = base_color; + bsdf->extra->cspec0 = cspec0; + + /* setup bsdf */ + sd->flag |= bsdf_microfacet_ggx_fresnel_setup(bsdf, sd); + } + } + + /* refraction */ +#ifdef __CAUSTICS_TRICKS__ + if(kernel_data.integrator.caustics_refractive || (path_flag & PATH_RAY_DIFFUSE) == 0) +#endif + { + 
MicrofacetBsdf *bsdf = (MicrofacetBsdf*)bsdf_alloc(sd, sizeof(MicrofacetBsdf), base_color*glass_weight*(1.0f - fresnel)); + + if(bsdf) { + bsdf->N = N; + + if(distribution == CLOSURE_BSDF_MICROFACET_GGX_GLASS_ID) + transmission_roughness = 1.0f - (1.0f - refl_roughness) * (1.0f - transmission_roughness); + else + transmission_roughness = refl_roughness; + + bsdf->alpha_x = transmission_roughness * transmission_roughness; + bsdf->alpha_y = transmission_roughness * transmission_roughness; + bsdf->ior = ior; + + /* setup bsdf */ + sd->flag |= bsdf_microfacet_ggx_refraction_setup(bsdf); + } + } + } + else { /* use multi-scatter GGX */ + MicrofacetBsdf *bsdf = (MicrofacetBsdf*)bsdf_alloc(sd, sizeof(MicrofacetBsdf), glass_weight); + MicrofacetExtra *extra = (MicrofacetExtra*)closure_alloc_extra(sd, sizeof(MicrofacetExtra)); + + if(bsdf && extra) { + bsdf->N = N; + bsdf->extra = extra; + bsdf->T = make_float3(0.0f, 0.0f, 0.0f); + + bsdf->alpha_x = roughness * roughness; + bsdf->alpha_y = roughness * roughness; + bsdf->ior = ior; + + bsdf->extra->color = base_color; + bsdf->extra->cspec0 = cspec0; + + /* setup bsdf */ + sd->flag |= bsdf_microfacet_multi_ggx_glass_fresnel_setup(bsdf, sd); + } + } + } +#ifdef __CAUSTICS_TRICKS__ + } +#endif + + /* clearcoat */ +#ifdef __CAUSTICS_TRICKS__ + if(kernel_data.integrator.caustics_reflective || (path_flag & PATH_RAY_DIFFUSE) == 0) { +#endif + if(clearcoat > CLOSURE_WEIGHT_CUTOFF) { + MicrofacetBsdf *bsdf = (MicrofacetBsdf*)bsdf_alloc(sd, sizeof(MicrofacetBsdf), weight); + MicrofacetExtra *extra = (MicrofacetExtra*)closure_alloc_extra(sd, sizeof(MicrofacetExtra)); + + if(bsdf && extra) { + bsdf->N = clearcoat_normal; + bsdf->ior = 1.5f; + bsdf->extra = extra; + + bsdf->alpha_x = clearcoat_roughness * clearcoat_roughness; + bsdf->alpha_y = clearcoat_roughness * clearcoat_roughness; + + bsdf->extra->cspec0 = make_float3(0.04f, 0.04f, 0.04f); + bsdf->extra->clearcoat = clearcoat; + + /* setup bsdf */ + sd->flag |= 
bsdf_microfacet_ggx_clearcoat_setup(bsdf, sd); + } + } +#ifdef __CAUSTICS_TRICKS__ + } +#endif + + break; + } +#endif /* __PRINCIPLED__ */ case CLOSURE_BSDF_DIFFUSE_ID: { - float3 weight = ccl_fetch(sd, svm_closure_weight) * mix_weight; + float3 weight = sd->svm_closure_weight * mix_weight; OrenNayarBsdf *bsdf = (OrenNayarBsdf*)bsdf_alloc(sd, sizeof(OrenNayarBsdf), weight); if(bsdf) { @@ -86,31 +425,32 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * float roughness = param1; if(roughness == 0.0f) { - ccl_fetch(sd, flag) |= bsdf_diffuse_setup((DiffuseBsdf*)bsdf); + sd->flag |= bsdf_diffuse_setup((DiffuseBsdf*)bsdf); } else { bsdf->roughness = roughness; - ccl_fetch(sd, flag) |= bsdf_oren_nayar_setup(bsdf); + sd->flag |= bsdf_oren_nayar_setup(bsdf); } } break; } case CLOSURE_BSDF_TRANSLUCENT_ID: { - float3 weight = ccl_fetch(sd, svm_closure_weight) * mix_weight; + float3 weight = sd->svm_closure_weight * mix_weight; DiffuseBsdf *bsdf = (DiffuseBsdf*)bsdf_alloc(sd, sizeof(DiffuseBsdf), weight); if(bsdf) { bsdf->N = N; - ccl_fetch(sd, flag) |= bsdf_translucent_setup(bsdf); + sd->flag |= bsdf_translucent_setup(bsdf); } break; } case CLOSURE_BSDF_TRANSPARENT_ID: { - float3 weight = ccl_fetch(sd, svm_closure_weight) * mix_weight; + float3 weight = sd->svm_closure_weight * mix_weight; ShaderClosure *bsdf = bsdf_alloc(sd, sizeof(ShaderClosure), weight); if(bsdf) { - ccl_fetch(sd, flag) |= bsdf_transparent_setup(bsdf); + bsdf->N = N; + sd->flag |= bsdf_transparent_setup(bsdf); } break; } @@ -123,7 +463,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * if(!kernel_data.integrator.caustics_reflective && (path_flag & PATH_RAY_DIFFUSE)) break; #endif - float3 weight = ccl_fetch(sd, svm_closure_weight) * mix_weight; + float3 weight = sd->svm_closure_weight * mix_weight; MicrofacetBsdf *bsdf = (MicrofacetBsdf*)bsdf_alloc(sd, sizeof(MicrofacetBsdf), weight); if(bsdf) { @@ -135,21 +475,21 @@ ccl_device void 
svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * /* setup bsdf */ if(type == CLOSURE_BSDF_REFLECTION_ID) - ccl_fetch(sd, flag) |= bsdf_reflection_setup(bsdf); + sd->flag |= bsdf_reflection_setup(bsdf); else if(type == CLOSURE_BSDF_MICROFACET_BECKMANN_ID) - ccl_fetch(sd, flag) |= bsdf_microfacet_beckmann_setup(bsdf); + sd->flag |= bsdf_microfacet_beckmann_setup(bsdf); else if(type == CLOSURE_BSDF_MICROFACET_GGX_ID) - ccl_fetch(sd, flag) |= bsdf_microfacet_ggx_setup(bsdf); + sd->flag |= bsdf_microfacet_ggx_setup(bsdf); else if(type == CLOSURE_BSDF_MICROFACET_MULTI_GGX_ID) { kernel_assert(stack_valid(data_node.z)); bsdf->extra = (MicrofacetExtra*)closure_alloc_extra(sd, sizeof(MicrofacetExtra)); if(bsdf->extra) { bsdf->extra->color = stack_load_float3(stack, data_node.z); - ccl_fetch(sd, flag) |= bsdf_microfacet_multi_ggx_setup(bsdf); + sd->flag |= bsdf_microfacet_multi_ggx_setup(bsdf); } } else - ccl_fetch(sd, flag) |= bsdf_ashikhmin_shirley_setup(bsdf); + sd->flag |= bsdf_ashikhmin_shirley_setup(bsdf); } break; @@ -161,7 +501,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * if(!kernel_data.integrator.caustics_refractive && (path_flag & PATH_RAY_DIFFUSE)) break; #endif - float3 weight = ccl_fetch(sd, svm_closure_weight) * mix_weight; + float3 weight = sd->svm_closure_weight * mix_weight; MicrofacetBsdf *bsdf = (MicrofacetBsdf*)bsdf_alloc(sd, sizeof(MicrofacetBsdf), weight); if(bsdf) { @@ -169,7 +509,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * bsdf->extra = NULL; float eta = fmaxf(param2, 1e-5f); - eta = (ccl_fetch(sd, flag) & SD_BACKFACING)? 1.0f/eta: eta; + eta = (sd->flag & SD_BACKFACING)? 
1.0f/eta: eta; /* setup bsdf */ if(type == CLOSURE_BSDF_REFRACTION_ID) { @@ -177,7 +517,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * bsdf->alpha_y = 0.0f; bsdf->ior = eta; - ccl_fetch(sd, flag) |= bsdf_refraction_setup(bsdf); + sd->flag |= bsdf_refraction_setup(bsdf); } else { bsdf->alpha_x = param1; @@ -185,9 +525,9 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * bsdf->ior = eta; if(type == CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID) - ccl_fetch(sd, flag) |= bsdf_microfacet_beckmann_refraction_setup(bsdf); + sd->flag |= bsdf_microfacet_beckmann_refraction_setup(bsdf); else - ccl_fetch(sd, flag) |= bsdf_microfacet_ggx_refraction_setup(bsdf); + sd->flag |= bsdf_microfacet_ggx_refraction_setup(bsdf); } } @@ -203,14 +543,14 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * break; } #endif - float3 weight = ccl_fetch(sd, svm_closure_weight) * mix_weight; + float3 weight = sd->svm_closure_weight * mix_weight; /* index of refraction */ float eta = fmaxf(param2, 1e-5f); - eta = (ccl_fetch(sd, flag) & SD_BACKFACING)? 1.0f/eta: eta; + eta = (sd->flag & SD_BACKFACING)? 
1.0f/eta: eta; /* fresnel */ - float cosNO = dot(N, ccl_fetch(sd, I)); + float cosNO = dot(N, sd->I); float fresnel = fresnel_dielectric_cos(cosNO, eta); float roughness = param1; @@ -249,7 +589,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * if(!kernel_data.integrator.caustics_reflective && !kernel_data.integrator.caustics_refractive && (path_flag & PATH_RAY_DIFFUSE)) break; #endif - float3 weight = ccl_fetch(sd, svm_closure_weight) * mix_weight; + float3 weight = sd->svm_closure_weight * mix_weight; MicrofacetBsdf *bsdf = (MicrofacetBsdf*)bsdf_alloc(sd, sizeof(MicrofacetBsdf), weight); MicrofacetExtra *extra = (MicrofacetExtra*)closure_alloc_extra(sd, sizeof(MicrofacetExtra)); @@ -261,13 +601,13 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * bsdf->alpha_x = param1; bsdf->alpha_y = param1; float eta = fmaxf(param2, 1e-5f); - bsdf->ior = (ccl_fetch(sd, flag) & SD_BACKFACING)? 1.0f/eta: eta; + bsdf->ior = (sd->flag & SD_BACKFACING)? 
1.0f/eta: eta; kernel_assert(stack_valid(data_node.z)); bsdf->extra->color = stack_load_float3(stack, data_node.z); /* setup bsdf */ - ccl_fetch(sd, flag) |= bsdf_microfacet_multi_ggx_glass_setup(bsdf); + sd->flag |= bsdf_microfacet_multi_ggx_glass_setup(bsdf); } break; @@ -280,7 +620,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * if(!kernel_data.integrator.caustics_reflective && (path_flag & PATH_RAY_DIFFUSE)) break; #endif - float3 weight = ccl_fetch(sd, svm_closure_weight) * mix_weight; + float3 weight = sd->svm_closure_weight * mix_weight; MicrofacetBsdf *bsdf = (MicrofacetBsdf*)bsdf_alloc(sd, sizeof(MicrofacetBsdf), weight); if(bsdf) { @@ -310,33 +650,33 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * bsdf->ior = 0.0f; if(type == CLOSURE_BSDF_MICROFACET_BECKMANN_ANISO_ID) { - ccl_fetch(sd, flag) |= bsdf_microfacet_beckmann_aniso_setup(bsdf); + sd->flag |= bsdf_microfacet_beckmann_aniso_setup(bsdf); } else if(type == CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID) { - ccl_fetch(sd, flag) |= bsdf_microfacet_ggx_aniso_setup(bsdf); + sd->flag |= bsdf_microfacet_ggx_aniso_setup(bsdf); } else if(type == CLOSURE_BSDF_MICROFACET_MULTI_GGX_ANISO_ID) { kernel_assert(stack_valid(data_node.w)); bsdf->extra = (MicrofacetExtra*)closure_alloc_extra(sd, sizeof(MicrofacetExtra)); if(bsdf->extra) { bsdf->extra->color = stack_load_float3(stack, data_node.w); - ccl_fetch(sd, flag) |= bsdf_microfacet_multi_ggx_aniso_setup(bsdf); + sd->flag |= bsdf_microfacet_multi_ggx_aniso_setup(bsdf); } } else - ccl_fetch(sd, flag) |= bsdf_ashikhmin_shirley_aniso_setup(bsdf); + sd->flag |= bsdf_ashikhmin_shirley_aniso_setup(bsdf); } break; } case CLOSURE_BSDF_ASHIKHMIN_VELVET_ID: { - float3 weight = ccl_fetch(sd, svm_closure_weight) * mix_weight; + float3 weight = sd->svm_closure_weight * mix_weight; VelvetBsdf *bsdf = (VelvetBsdf*)bsdf_alloc(sd, sizeof(VelvetBsdf), weight); if(bsdf) { bsdf->N = N; bsdf->sigma = saturate(param1); - 
ccl_fetch(sd, flag) |= bsdf_ashikhmin_velvet_setup(bsdf); + sd->flag |= bsdf_ashikhmin_velvet_setup(bsdf); } break; } @@ -344,9 +684,10 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * #ifdef __CAUSTICS_TRICKS__ if(!kernel_data.integrator.caustics_reflective && (path_flag & PATH_RAY_DIFFUSE)) break; + ATTR_FALLTHROUGH; #endif case CLOSURE_BSDF_DIFFUSE_TOON_ID: { - float3 weight = ccl_fetch(sd, svm_closure_weight) * mix_weight; + float3 weight = sd->svm_closure_weight * mix_weight; ToonBsdf *bsdf = (ToonBsdf*)bsdf_alloc(sd, sizeof(ToonBsdf), weight); if(bsdf) { @@ -355,34 +696,36 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * bsdf->smooth = param2; if(type == CLOSURE_BSDF_DIFFUSE_TOON_ID) - ccl_fetch(sd, flag) |= bsdf_diffuse_toon_setup(bsdf); + sd->flag |= bsdf_diffuse_toon_setup(bsdf); else - ccl_fetch(sd, flag) |= bsdf_glossy_toon_setup(bsdf); + sd->flag |= bsdf_glossy_toon_setup(bsdf); } break; } #ifdef __HAIR__ case CLOSURE_BSDF_HAIR_REFLECTION_ID: case CLOSURE_BSDF_HAIR_TRANSMISSION_ID: { - float3 weight = ccl_fetch(sd, svm_closure_weight) * mix_weight; + float3 weight = sd->svm_closure_weight * mix_weight; - if(ccl_fetch(sd, flag) & SD_BACKFACING && ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE) { + if(sd->flag & SD_BACKFACING && sd->type & PRIMITIVE_ALL_CURVE) { ShaderClosure *bsdf = bsdf_alloc(sd, sizeof(ShaderClosure), weight); if(bsdf) { + bsdf->N = N; /* todo: giving a fixed weight here will cause issues when * mixing multiple BSDFS. energy will not be conserved and * the throughput can blow up after multiple bounces. 
we * better figure out a way to skip backfaces from rays * spawned by transmission from the front */ bsdf->weight = make_float3(1.0f, 1.0f, 1.0f); - ccl_fetch(sd, flag) |= bsdf_transparent_setup(bsdf); + sd->flag |= bsdf_transparent_setup(bsdf); } } else { HairBsdf *bsdf = (HairBsdf*)bsdf_alloc(sd, sizeof(HairBsdf), weight); if(bsdf) { + bsdf->N = N; bsdf->roughness1 = param1; bsdf->roughness2 = param2; bsdf->offset = -stack_load_float(stack, data_node.z); @@ -390,18 +733,18 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * if(stack_valid(data_node.y)) { bsdf->T = normalize(stack_load_float3(stack, data_node.y)); } - else if(!(ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE)) { - bsdf->T = normalize(ccl_fetch(sd, dPdv)); + else if(!(sd->type & PRIMITIVE_ALL_CURVE)) { + bsdf->T = normalize(sd->dPdv); bsdf->offset = 0.0f; } else - bsdf->T = normalize(ccl_fetch(sd, dPdu)); + bsdf->T = normalize(sd->dPdu); if(type == CLOSURE_BSDF_HAIR_REFLECTION_ID) { - ccl_fetch(sd, flag) |= bsdf_hair_reflection_setup(bsdf); + sd->flag |= bsdf_hair_reflection_setup(bsdf); } else { - ccl_fetch(sd, flag) |= bsdf_hair_transmission_setup(bsdf); + sd->flag |= bsdf_hair_transmission_setup(bsdf); } } } @@ -414,8 +757,8 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * case CLOSURE_BSSRDF_CUBIC_ID: case CLOSURE_BSSRDF_GAUSSIAN_ID: case CLOSURE_BSSRDF_BURLEY_ID: { - float3 albedo = ccl_fetch(sd, svm_closure_weight); - float3 weight = ccl_fetch(sd, svm_closure_weight) * mix_weight; + float3 albedo = sd->svm_closure_weight; + float3 weight = sd->svm_closure_weight * mix_weight; float sample_weight = fabsf(average(weight)); /* disable in case of diffuse ancestor, can't see it well then and @@ -441,7 +784,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * bssrdf->albedo = albedo.x; bssrdf->sharpness = sharpness; bssrdf->N = N; - ccl_fetch(sd, flag) |= bssrdf_setup(bssrdf, (ClosureType)type); + sd->flag 
|= bssrdf_setup(bssrdf, (ClosureType)type); } bssrdf = bssrdf_alloc(sd, make_float3(0.0f, weight.y, 0.0f)); @@ -452,7 +795,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * bssrdf->albedo = albedo.y; bssrdf->sharpness = sharpness; bssrdf->N = N; - ccl_fetch(sd, flag) |= bssrdf_setup(bssrdf, (ClosureType)type); + sd->flag |= bssrdf_setup(bssrdf, (ClosureType)type); } bssrdf = bssrdf_alloc(sd, make_float3(0.0f, 0.0f, weight.z)); @@ -463,7 +806,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * bssrdf->albedo = albedo.z; bssrdf->sharpness = sharpness; bssrdf->N = N; - ccl_fetch(sd, flag) |= bssrdf_setup(bssrdf, (ClosureType)type); + sd->flag |= bssrdf_setup(bssrdf, (ClosureType)type); } } @@ -493,21 +836,21 @@ ccl_device void svm_node_closure_volume(KernelGlobals *kg, ShaderData *sd, float switch(type) { case CLOSURE_VOLUME_ABSORPTION_ID: { - float3 weight = (make_float3(1.0f, 1.0f, 1.0f) - ccl_fetch(sd, svm_closure_weight)) * mix_weight * density; + float3 weight = (make_float3(1.0f, 1.0f, 1.0f) - sd->svm_closure_weight) * mix_weight * density; ShaderClosure *sc = closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_NONE_ID, weight); if(sc) { - ccl_fetch(sd, flag) |= volume_absorption_setup(sc); + sd->flag |= volume_absorption_setup(sc); } break; } case CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID: { - float3 weight = ccl_fetch(sd, svm_closure_weight) * mix_weight * density; + float3 weight = sd->svm_closure_weight * mix_weight * density; HenyeyGreensteinVolume *volume = (HenyeyGreensteinVolume*)bsdf_alloc(sd, sizeof(HenyeyGreensteinVolume), weight); if(volume) { volume->g = param2; /* g */ - ccl_fetch(sd, flag) |= volume_henyey_greenstein_setup(volume); + sd->flag |= volume_henyey_greenstein_setup(volume); } break; } @@ -527,12 +870,12 @@ ccl_device void svm_node_closure_emission(ShaderData *sd, float *stack, uint4 no if(mix_weight == 0.0f) return; - closure_alloc(sd, sizeof(ShaderClosure), 
CLOSURE_EMISSION_ID, ccl_fetch(sd, svm_closure_weight) * mix_weight); + closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_EMISSION_ID, sd->svm_closure_weight * mix_weight); } else - closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_EMISSION_ID, ccl_fetch(sd, svm_closure_weight)); + closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_EMISSION_ID, sd->svm_closure_weight); - ccl_fetch(sd, flag) |= SD_EMISSION; + sd->flag |= SD_EMISSION; } ccl_device void svm_node_closure_background(ShaderData *sd, float *stack, uint4 node) @@ -545,10 +888,10 @@ ccl_device void svm_node_closure_background(ShaderData *sd, float *stack, uint4 if(mix_weight == 0.0f) return; - closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_BACKGROUND_ID, ccl_fetch(sd, svm_closure_weight) * mix_weight); + closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_BACKGROUND_ID, sd->svm_closure_weight * mix_weight); } else - closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_BACKGROUND_ID, ccl_fetch(sd, svm_closure_weight)); + closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_BACKGROUND_ID, sd->svm_closure_weight); } ccl_device void svm_node_closure_holdout(ShaderData *sd, float *stack, uint4 node) @@ -561,12 +904,12 @@ ccl_device void svm_node_closure_holdout(ShaderData *sd, float *stack, uint4 nod if(mix_weight == 0.0f) return; - closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_HOLDOUT_ID, ccl_fetch(sd, svm_closure_weight) * mix_weight); + closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_HOLDOUT_ID, sd->svm_closure_weight * mix_weight); } else - closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_HOLDOUT_ID, ccl_fetch(sd, svm_closure_weight)); + closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_HOLDOUT_ID, sd->svm_closure_weight); - ccl_fetch(sd, flag) |= SD_HOLDOUT; + sd->flag |= SD_HOLDOUT; } ccl_device void svm_node_closure_ambient_occlusion(ShaderData *sd, float *stack, uint4 node) @@ -579,19 +922,19 @@ ccl_device void svm_node_closure_ambient_occlusion(ShaderData *sd, float *stack, if(mix_weight == 0.0f) return; - 
closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_AMBIENT_OCCLUSION_ID, ccl_fetch(sd, svm_closure_weight) * mix_weight); + closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_AMBIENT_OCCLUSION_ID, sd->svm_closure_weight * mix_weight); } else - closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_AMBIENT_OCCLUSION_ID, ccl_fetch(sd, svm_closure_weight)); + closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_AMBIENT_OCCLUSION_ID, sd->svm_closure_weight); - ccl_fetch(sd, flag) |= SD_AO; + sd->flag |= SD_AO; } /* Closure Nodes */ ccl_device_inline void svm_node_closure_store_weight(ShaderData *sd, float3 weight) { - ccl_fetch(sd, svm_closure_weight) = weight; + sd->svm_closure_weight = weight; } ccl_device void svm_node_closure_set_weight(ShaderData *sd, uint r, uint g, uint b) @@ -641,7 +984,7 @@ ccl_device void svm_node_mix_closure(ShaderData *sd, float *stack, uint4 node) ccl_device void svm_node_set_normal(KernelGlobals *kg, ShaderData *sd, float *stack, uint in_direction, uint out_normal) { float3 normal = stack_load_float3(stack, in_direction); - ccl_fetch(sd, N) = normal; + sd->N = normal; stack_store_float3(stack, out_normal, normal); } diff --git a/intern/cycles/kernel/svm/svm_displace.h b/intern/cycles/kernel/svm/svm_displace.h index 890ab41aaaa..656357be52d 100644 --- a/intern/cycles/kernel/svm/svm_displace.h +++ b/intern/cycles/kernel/svm/svm_displace.h @@ -25,10 +25,10 @@ ccl_device void svm_node_set_bump(KernelGlobals *kg, ShaderData *sd, float *stac uint normal_offset, distance_offset, invert, use_object_space; decode_node_uchar4(node.y, &normal_offset, &distance_offset, &invert, &use_object_space); - float3 normal_in = stack_valid(normal_offset)? stack_load_float3(stack, normal_offset): ccl_fetch(sd, N); + float3 normal_in = stack_valid(normal_offset)? 
stack_load_float3(stack, normal_offset): sd->N; - float3 dPdx = ccl_fetch(sd, dP).dx; - float3 dPdy = ccl_fetch(sd, dP).dy; + float3 dPdx = sd->dP.dx; + float3 dPdy = sd->dP.dy; if(use_object_space) { object_inverse_normal_transform(kg, sd, &normal_in); @@ -63,8 +63,13 @@ ccl_device void svm_node_set_bump(KernelGlobals *kg, ShaderData *sd, float *stac strength = max(strength, 0.0f); /* compute and output perturbed normal */ - float3 normal_out = normalize(absdet*normal_in - distance*signf(det)*surfgrad); - normal_out = normalize(strength*normal_out + (1.0f - strength)*normal_in); + float3 normal_out = safe_normalize(absdet*normal_in - distance*signf(det)*surfgrad); + if(is_zero(normal_out)) { + normal_out = normal_in; + } + else { + normal_out = normalize(strength*normal_out + (1.0f - strength)*normal_in); + } if(use_object_space) { object_normal_transform(kg, sd, &normal_out); @@ -80,14 +85,14 @@ ccl_device void svm_node_set_displacement(KernelGlobals *kg, ShaderData *sd, flo { float d = stack_load_float(stack, fac_offset); - float3 dP = ccl_fetch(sd, N); + float3 dP = sd->N; object_inverse_normal_transform(kg, sd, &dP); dP *= d*0.1f; /* todo: get rid of this factor */ object_dir_transform(kg, sd, &dP); - ccl_fetch(sd, P) += dP; + sd->P += dP; } CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/svm/svm_fresnel.h b/intern/cycles/kernel/svm/svm_fresnel.h index 23c97d80cb0..3703ec55015 100644 --- a/intern/cycles/kernel/svm/svm_fresnel.h +++ b/intern/cycles/kernel/svm/svm_fresnel.h @@ -23,12 +23,12 @@ ccl_device void svm_node_fresnel(ShaderData *sd, float *stack, uint ior_offset, uint normal_offset, out_offset; decode_node_uchar4(node, &normal_offset, &out_offset, NULL, NULL); float eta = (stack_valid(ior_offset))? stack_load_float(stack, ior_offset): __uint_as_float(ior_value); - float3 normal_in = stack_valid(normal_offset)? stack_load_float3(stack, normal_offset): ccl_fetch(sd, N); + float3 normal_in = stack_valid(normal_offset)? 
stack_load_float3(stack, normal_offset): sd->N; eta = fmaxf(eta, 1e-5f); - eta = (ccl_fetch(sd, flag) & SD_BACKFACING)? 1.0f/eta: eta; + eta = (sd->flag & SD_BACKFACING)? 1.0f/eta: eta; - float f = fresnel_dielectric_cos(dot(ccl_fetch(sd, I), normal_in), eta); + float f = fresnel_dielectric_cos(dot(sd->I, normal_in), eta); stack_store_float(stack, out_offset, f); } @@ -44,18 +44,18 @@ ccl_device void svm_node_layer_weight(ShaderData *sd, float *stack, uint4 node) decode_node_uchar4(node.w, &type, &normal_offset, &out_offset, NULL); float blend = (stack_valid(blend_offset))? stack_load_float(stack, blend_offset): __uint_as_float(blend_value); - float3 normal_in = (stack_valid(normal_offset))? stack_load_float3(stack, normal_offset): ccl_fetch(sd, N); + float3 normal_in = (stack_valid(normal_offset))? stack_load_float3(stack, normal_offset): sd->N; float f; if(type == NODE_LAYER_WEIGHT_FRESNEL) { float eta = fmaxf(1.0f - blend, 1e-5f); - eta = (ccl_fetch(sd, flag) & SD_BACKFACING)? eta: 1.0f/eta; + eta = (sd->flag & SD_BACKFACING)? 
eta: 1.0f/eta; - f = fresnel_dielectric_cos(dot(ccl_fetch(sd, I), normal_in), eta); + f = fresnel_dielectric_cos(dot(sd->I, normal_in), eta); } else { - f = fabsf(dot(ccl_fetch(sd, I), normal_in)); + f = fabsf(dot(sd->I, normal_in)); if(blend != 0.5f) { blend = clamp(blend, 0.0f, 1.0f-1e-5f); diff --git a/intern/cycles/kernel/svm/svm_geometry.h b/intern/cycles/kernel/svm/svm_geometry.h index 7d512f7ff4d..cce4e89e715 100644 --- a/intern/cycles/kernel/svm/svm_geometry.h +++ b/intern/cycles/kernel/svm/svm_geometry.h @@ -27,16 +27,17 @@ ccl_device_inline void svm_node_geometry(KernelGlobals *kg, float3 data; switch(type) { - case NODE_GEOM_P: data = ccl_fetch(sd, P); break; - case NODE_GEOM_N: data = ccl_fetch(sd, N); break; + case NODE_GEOM_P: data = sd->P; break; + case NODE_GEOM_N: data = sd->N; break; #ifdef __DPDU__ case NODE_GEOM_T: data = primitive_tangent(kg, sd); break; #endif - case NODE_GEOM_I: data = ccl_fetch(sd, I); break; - case NODE_GEOM_Ng: data = ccl_fetch(sd, Ng); break; + case NODE_GEOM_I: data = sd->I; break; + case NODE_GEOM_Ng: data = sd->Ng; break; #ifdef __UV__ - case NODE_GEOM_uv: data = make_float3(ccl_fetch(sd, u), ccl_fetch(sd, v), 0.0f); break; + case NODE_GEOM_uv: data = make_float3(sd->u, sd->v, 0.0f); break; #endif + default: data = make_float3(0.0f, 0.0f, 0.0f); } stack_store_float3(stack, out_offset, data); @@ -48,8 +49,8 @@ ccl_device void svm_node_geometry_bump_dx(KernelGlobals *kg, ShaderData *sd, flo float3 data; switch(type) { - case NODE_GEOM_P: data = ccl_fetch(sd, P) + ccl_fetch(sd, dP).dx; break; - case NODE_GEOM_uv: data = make_float3(ccl_fetch(sd, u) + ccl_fetch(sd, du).dx, ccl_fetch(sd, v) + ccl_fetch(sd, dv).dx, 0.0f); break; + case NODE_GEOM_P: data = sd->P + sd->dP.dx; break; + case NODE_GEOM_uv: data = make_float3(sd->u + sd->du.dx, sd->v + sd->dv.dx, 0.0f); break; default: svm_node_geometry(kg, sd, stack, type, out_offset); return; } @@ -65,8 +66,8 @@ ccl_device void svm_node_geometry_bump_dy(KernelGlobals *kg, 
ShaderData *sd, flo float3 data; switch(type) { - case NODE_GEOM_P: data = ccl_fetch(sd, P) + ccl_fetch(sd, dP).dy; break; - case NODE_GEOM_uv: data = make_float3(ccl_fetch(sd, u) + ccl_fetch(sd, du).dy, ccl_fetch(sd, v) + ccl_fetch(sd, dv).dy, 0.0f); break; + case NODE_GEOM_P: data = sd->P + sd->dP.dy; break; + case NODE_GEOM_uv: data = make_float3(sd->u + sd->du.dy, sd->v + sd->dv.dy, 0.0f); break; default: svm_node_geometry(kg, sd, stack, type, out_offset); return; } @@ -87,9 +88,9 @@ ccl_device void svm_node_object_info(KernelGlobals *kg, ShaderData *sd, float *s stack_store_float3(stack, out_offset, object_location(kg, sd)); return; } - case NODE_INFO_OB_INDEX: data = object_pass_id(kg, ccl_fetch(sd, object)); break; + case NODE_INFO_OB_INDEX: data = object_pass_id(kg, sd->object); break; case NODE_INFO_MAT_INDEX: data = shader_pass_id(kg, sd); break; - case NODE_INFO_OB_RANDOM: data = object_random_number(kg, ccl_fetch(sd, object)); break; + case NODE_INFO_OB_RANDOM: data = object_random_number(kg, sd->object); break; default: data = 0.0f; break; } @@ -106,44 +107,44 @@ ccl_device void svm_node_particle_info(KernelGlobals *kg, { switch(type) { case NODE_INFO_PAR_INDEX: { - int particle_id = object_particle_id(kg, ccl_fetch(sd, object)); + int particle_id = object_particle_id(kg, sd->object); stack_store_float(stack, out_offset, particle_index(kg, particle_id)); break; } case NODE_INFO_PAR_AGE: { - int particle_id = object_particle_id(kg, ccl_fetch(sd, object)); + int particle_id = object_particle_id(kg, sd->object); stack_store_float(stack, out_offset, particle_age(kg, particle_id)); break; } case NODE_INFO_PAR_LIFETIME: { - int particle_id = object_particle_id(kg, ccl_fetch(sd, object)); + int particle_id = object_particle_id(kg, sd->object); stack_store_float(stack, out_offset, particle_lifetime(kg, particle_id)); break; } case NODE_INFO_PAR_LOCATION: { - int particle_id = object_particle_id(kg, ccl_fetch(sd, object)); + int particle_id = 
object_particle_id(kg, sd->object); stack_store_float3(stack, out_offset, particle_location(kg, particle_id)); break; } #if 0 /* XXX float4 currently not supported in SVM stack */ case NODE_INFO_PAR_ROTATION: { - int particle_id = object_particle_id(kg, ccl_fetch(sd, object)); + int particle_id = object_particle_id(kg, sd->object); stack_store_float4(stack, out_offset, particle_rotation(kg, particle_id)); break; } #endif case NODE_INFO_PAR_SIZE: { - int particle_id = object_particle_id(kg, ccl_fetch(sd, object)); + int particle_id = object_particle_id(kg, sd->object); stack_store_float(stack, out_offset, particle_size(kg, particle_id)); break; } case NODE_INFO_PAR_VELOCITY: { - int particle_id = object_particle_id(kg, ccl_fetch(sd, object)); + int particle_id = object_particle_id(kg, sd->object); stack_store_float3(stack, out_offset, particle_velocity(kg, particle_id)); break; } case NODE_INFO_PAR_ANGULAR_VELOCITY: { - int particle_id = object_particle_id(kg, ccl_fetch(sd, object)); + int particle_id = object_particle_id(kg, sd->object); stack_store_float3(stack, out_offset, particle_angular_velocity(kg, particle_id)); break; } @@ -165,7 +166,7 @@ ccl_device void svm_node_hair_info(KernelGlobals *kg, switch(type) { case NODE_INFO_CURVE_IS_STRAND: { - data = (ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE) != 0; + data = (sd->type & PRIMITIVE_ALL_CURVE) != 0; stack_store_float(stack, out_offset, data); break; } @@ -177,7 +178,7 @@ ccl_device void svm_node_hair_info(KernelGlobals *kg, break; } /*case NODE_INFO_CURVE_FADE: { - data = ccl_fetch(sd, curve_transparency); + data = sd->curve_transparency; stack_store_float(stack, out_offset, data); break; }*/ diff --git a/intern/cycles/kernel/svm/svm_image.h b/intern/cycles/kernel/svm/svm_image.h index 2afdf61b476..4226e7adfe0 100644 --- a/intern/cycles/kernel/svm/svm_image.h +++ b/intern/cycles/kernel/svm/svm_image.h @@ -16,186 +16,25 @@ CCL_NAMESPACE_BEGIN -/* Float4 textures on various devices. 
*/ -#if defined(__KERNEL_CPU__) -# define TEX_NUM_FLOAT4_IMAGES TEX_NUM_FLOAT4_CPU -#elif defined(__KERNEL_CUDA__) -# if __CUDA_ARCH__ < 300 -# define TEX_NUM_FLOAT4_IMAGES TEX_NUM_FLOAT4_CUDA -# else -# define TEX_NUM_FLOAT4_IMAGES TEX_NUM_FLOAT4_CUDA_KEPLER -# endif -#else -# define TEX_NUM_FLOAT4_IMAGES TEX_NUM_FLOAT4_OPENCL -#endif - ccl_device float4 svm_image_texture(KernelGlobals *kg, int id, float x, float y, uint srgb, uint use_alpha) { -#ifdef __KERNEL_CPU__ -# ifdef __KERNEL_SSE2__ - ssef r_ssef; - float4 &r = (float4 &)r_ssef; - r = kernel_tex_image_interp(id, x, y); -# else - float4 r = kernel_tex_image_interp(id, x, y); -# endif -#elif defined(__KERNEL_OPENCL__) float4 r = kernel_tex_image_interp(kg, id, x, y); -#else - float4 r; - -# if __CUDA_ARCH__ < 300 - /* not particularly proud of this massive switch, what are the - * alternatives? - * - use a single big 1D texture, and do our own lookup/filtering - * - group by size and use a 3d texture, performance impact - * - group into larger texture with some padding for correct lerp - * - * also note that cuda has a textures limit (128 for Fermi, 256 for Kepler), - * and we cannot use all since we still need some for other storage */ - - switch(id) { - case 0: r = kernel_tex_image_interp(__tex_image_float4_000, x, y); break; - case 1: r = kernel_tex_image_interp(__tex_image_float4_001, x, y); break; - case 2: r = kernel_tex_image_interp(__tex_image_float4_002, x, y); break; - case 3: r = kernel_tex_image_interp(__tex_image_float4_003, x, y); break; - case 4: r = kernel_tex_image_interp(__tex_image_float4_004, x, y); break; - case 5: r = kernel_tex_image_interp(__tex_image_byte4_005, x, y); break; - case 6: r = kernel_tex_image_interp(__tex_image_byte4_006, x, y); break; - case 7: r = kernel_tex_image_interp(__tex_image_byte4_007, x, y); break; - case 8: r = kernel_tex_image_interp(__tex_image_byte4_008, x, y); break; - case 9: r = kernel_tex_image_interp(__tex_image_byte4_009, x, y); break; - case 10: r 
= kernel_tex_image_interp(__tex_image_byte4_010, x, y); break; - case 11: r = kernel_tex_image_interp(__tex_image_byte4_011, x, y); break; - case 12: r = kernel_tex_image_interp(__tex_image_byte4_012, x, y); break; - case 13: r = kernel_tex_image_interp(__tex_image_byte4_013, x, y); break; - case 14: r = kernel_tex_image_interp(__tex_image_byte4_014, x, y); break; - case 15: r = kernel_tex_image_interp(__tex_image_byte4_015, x, y); break; - case 16: r = kernel_tex_image_interp(__tex_image_byte4_016, x, y); break; - case 17: r = kernel_tex_image_interp(__tex_image_byte4_017, x, y); break; - case 18: r = kernel_tex_image_interp(__tex_image_byte4_018, x, y); break; - case 19: r = kernel_tex_image_interp(__tex_image_byte4_019, x, y); break; - case 20: r = kernel_tex_image_interp(__tex_image_byte4_020, x, y); break; - case 21: r = kernel_tex_image_interp(__tex_image_byte4_021, x, y); break; - case 22: r = kernel_tex_image_interp(__tex_image_byte4_022, x, y); break; - case 23: r = kernel_tex_image_interp(__tex_image_byte4_023, x, y); break; - case 24: r = kernel_tex_image_interp(__tex_image_byte4_024, x, y); break; - case 25: r = kernel_tex_image_interp(__tex_image_byte4_025, x, y); break; - case 26: r = kernel_tex_image_interp(__tex_image_byte4_026, x, y); break; - case 27: r = kernel_tex_image_interp(__tex_image_byte4_027, x, y); break; - case 28: r = kernel_tex_image_interp(__tex_image_byte4_028, x, y); break; - case 29: r = kernel_tex_image_interp(__tex_image_byte4_029, x, y); break; - case 30: r = kernel_tex_image_interp(__tex_image_byte4_030, x, y); break; - case 31: r = kernel_tex_image_interp(__tex_image_byte4_031, x, y); break; - case 32: r = kernel_tex_image_interp(__tex_image_byte4_032, x, y); break; - case 33: r = kernel_tex_image_interp(__tex_image_byte4_033, x, y); break; - case 34: r = kernel_tex_image_interp(__tex_image_byte4_034, x, y); break; - case 35: r = kernel_tex_image_interp(__tex_image_byte4_035, x, y); break; - case 36: r = 
kernel_tex_image_interp(__tex_image_byte4_036, x, y); break; - case 37: r = kernel_tex_image_interp(__tex_image_byte4_037, x, y); break; - case 38: r = kernel_tex_image_interp(__tex_image_byte4_038, x, y); break; - case 39: r = kernel_tex_image_interp(__tex_image_byte4_039, x, y); break; - case 40: r = kernel_tex_image_interp(__tex_image_byte4_040, x, y); break; - case 41: r = kernel_tex_image_interp(__tex_image_byte4_041, x, y); break; - case 42: r = kernel_tex_image_interp(__tex_image_byte4_042, x, y); break; - case 43: r = kernel_tex_image_interp(__tex_image_byte4_043, x, y); break; - case 44: r = kernel_tex_image_interp(__tex_image_byte4_044, x, y); break; - case 45: r = kernel_tex_image_interp(__tex_image_byte4_045, x, y); break; - case 46: r = kernel_tex_image_interp(__tex_image_byte4_046, x, y); break; - case 47: r = kernel_tex_image_interp(__tex_image_byte4_047, x, y); break; - case 48: r = kernel_tex_image_interp(__tex_image_byte4_048, x, y); break; - case 49: r = kernel_tex_image_interp(__tex_image_byte4_049, x, y); break; - case 50: r = kernel_tex_image_interp(__tex_image_byte4_050, x, y); break; - case 51: r = kernel_tex_image_interp(__tex_image_byte4_051, x, y); break; - case 52: r = kernel_tex_image_interp(__tex_image_byte4_052, x, y); break; - case 53: r = kernel_tex_image_interp(__tex_image_byte4_053, x, y); break; - case 54: r = kernel_tex_image_interp(__tex_image_byte4_054, x, y); break; - case 55: r = kernel_tex_image_interp(__tex_image_byte4_055, x, y); break; - case 56: r = kernel_tex_image_interp(__tex_image_byte4_056, x, y); break; - case 57: r = kernel_tex_image_interp(__tex_image_byte4_057, x, y); break; - case 58: r = kernel_tex_image_interp(__tex_image_byte4_058, x, y); break; - case 59: r = kernel_tex_image_interp(__tex_image_byte4_059, x, y); break; - case 60: r = kernel_tex_image_interp(__tex_image_byte4_060, x, y); break; - case 61: r = kernel_tex_image_interp(__tex_image_byte4_061, x, y); break; - case 62: r = 
kernel_tex_image_interp(__tex_image_byte4_062, x, y); break; - case 63: r = kernel_tex_image_interp(__tex_image_byte4_063, x, y); break; - case 64: r = kernel_tex_image_interp(__tex_image_byte4_064, x, y); break; - case 65: r = kernel_tex_image_interp(__tex_image_byte4_065, x, y); break; - case 66: r = kernel_tex_image_interp(__tex_image_byte4_066, x, y); break; - case 67: r = kernel_tex_image_interp(__tex_image_byte4_067, x, y); break; - case 68: r = kernel_tex_image_interp(__tex_image_byte4_068, x, y); break; - case 69: r = kernel_tex_image_interp(__tex_image_byte4_069, x, y); break; - case 70: r = kernel_tex_image_interp(__tex_image_byte4_070, x, y); break; - case 71: r = kernel_tex_image_interp(__tex_image_byte4_071, x, y); break; - case 72: r = kernel_tex_image_interp(__tex_image_byte4_072, x, y); break; - case 73: r = kernel_tex_image_interp(__tex_image_byte4_073, x, y); break; - case 74: r = kernel_tex_image_interp(__tex_image_byte4_074, x, y); break; - case 75: r = kernel_tex_image_interp(__tex_image_byte4_075, x, y); break; - case 76: r = kernel_tex_image_interp(__tex_image_byte4_076, x, y); break; - case 77: r = kernel_tex_image_interp(__tex_image_byte4_077, x, y); break; - case 78: r = kernel_tex_image_interp(__tex_image_byte4_078, x, y); break; - case 79: r = kernel_tex_image_interp(__tex_image_byte4_079, x, y); break; - case 80: r = kernel_tex_image_interp(__tex_image_byte4_080, x, y); break; - case 81: r = kernel_tex_image_interp(__tex_image_byte4_081, x, y); break; - case 82: r = kernel_tex_image_interp(__tex_image_byte4_082, x, y); break; - case 83: r = kernel_tex_image_interp(__tex_image_byte4_083, x, y); break; - case 84: r = kernel_tex_image_interp(__tex_image_byte4_084, x, y); break; - case 85: r = kernel_tex_image_interp(__tex_image_byte4_085, x, y); break; - case 86: r = kernel_tex_image_interp(__tex_image_byte4_086, x, y); break; - case 87: r = kernel_tex_image_interp(__tex_image_byte4_087, x, y); break; - case 88: r = 
kernel_tex_image_interp(__tex_image_byte4_088, x, y); break; - case 89: r = kernel_tex_image_interp(__tex_image_byte4_089, x, y); break; - default: - kernel_assert(0); - return make_float4(0.0f, 0.0f, 0.0f, 0.0f); - } -# else - CUtexObject tex = kernel_tex_fetch(__bindless_mapping, id); - /* float4, byte4 and half4 */ - if(id < TEX_START_FLOAT_CUDA_KEPLER) - r = kernel_tex_image_interp_float4(tex, x, y); - /* float, byte and half */ - else { - float f = kernel_tex_image_interp_float(tex, x, y); - r = make_float4(f, f, f, 1.0f); - } -# endif -#endif - -#ifdef __KERNEL_SSE2__ - float alpha = r.w; + const float alpha = r.w; if(use_alpha && alpha != 1.0f && alpha != 0.0f) { - r_ssef = r_ssef / ssef(alpha); - if(id >= TEX_NUM_FLOAT4_IMAGES) - r_ssef = min(r_ssef, ssef(1.0f)); - r.w = alpha; - } - - if(srgb) { - r_ssef = color_srgb_to_scene_linear(r_ssef); - r.w = alpha; - } -#else - if(use_alpha && r.w != 1.0f && r.w != 0.0f) { - float invw = 1.0f/r.w; - r.x *= invw; - r.y *= invw; - r.z *= invw; - - if(id >= TEX_NUM_FLOAT4_IMAGES) { - r.x = min(r.x, 1.0f); - r.y = min(r.y, 1.0f); - r.z = min(r.z, 1.0f); + r /= alpha; + const int texture_type = kernel_tex_type(id); + if(texture_type == IMAGE_DATA_TYPE_BYTE4 || + texture_type == IMAGE_DATA_TYPE_BYTE) + { + r = min(r, make_float4(1.0f, 1.0f, 1.0f, 1.0f)); } + r.w = alpha; } if(srgb) { - r.x = color_srgb_to_scene_linear(r.x); - r.y = color_srgb_to_scene_linear(r.y); - r.z = color_srgb_to_scene_linear(r.z); + r = color_srgb_to_scene_linear_v4(r); } -#endif return r; } @@ -238,12 +77,14 @@ ccl_device void svm_node_tex_image(KernelGlobals *kg, ShaderData *sd, float *sta ccl_device void svm_node_tex_image_box(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node) { /* get object space normal */ - float3 N = ccl_fetch(sd, N); + float3 N = sd->N; - N = ccl_fetch(sd, N); + N = sd->N; object_inverse_normal_transform(kg, sd, &N); /* project from direction vector to barycentric coordinates in triangles */ + float3 signed_N = 
N; + N.x = fabsf(N.x); N.y = fabsf(N.y); N.z = fabsf(N.z); @@ -313,12 +154,19 @@ ccl_device void svm_node_tex_image_box(KernelGlobals *kg, ShaderData *sd, float float4 f = make_float4(0.0f, 0.0f, 0.0f, 0.0f); uint use_alpha = stack_valid(alpha_offset); - if(weight.x > 0.0f) - f += weight.x*svm_image_texture(kg, id, co.y, co.z, srgb, use_alpha); - if(weight.y > 0.0f) - f += weight.y*svm_image_texture(kg, id, co.x, co.z, srgb, use_alpha); - if(weight.z > 0.0f) - f += weight.z*svm_image_texture(kg, id, co.y, co.x, srgb, use_alpha); + /* Map so that no textures are flipped, rotation is somewhat arbitrary. */ + if(weight.x > 0.0f) { + float2 uv = make_float2((signed_N.x < 0.0f)? 1.0f - co.y: co.y, co.z); + f += weight.x*svm_image_texture(kg, id, uv.x, uv.y, srgb, use_alpha); + } + if(weight.y > 0.0f) { + float2 uv = make_float2((signed_N.y > 0.0f)? 1.0f - co.x: co.x, co.z); + f += weight.y*svm_image_texture(kg, id, uv.x, uv.y, srgb, use_alpha); + } + if(weight.z > 0.0f) { + float2 uv = make_float2((signed_N.z > 0.0f)? 1.0f - co.y: co.y, co.x); + f += weight.z*svm_image_texture(kg, id, uv.x, uv.y, srgb, use_alpha); + } if(stack_valid(out_offset)) stack_store_float3(stack, out_offset, make_float3(f.x, f.y, f.z)); @@ -337,8 +185,8 @@ ccl_device void svm_node_tex_environment(KernelGlobals *kg, ShaderData *sd, floa float3 co = stack_load_float3(stack, co_offset); float2 uv; - co = normalize(co); - + co = safe_normalize(co); + if(projection == 0) uv = direction_to_equirectangular(co); else diff --git a/intern/cycles/kernel/svm/svm_light_path.h b/intern/cycles/kernel/svm/svm_light_path.h index f35ea05048b..1492e358608 100644 --- a/intern/cycles/kernel/svm/svm_light_path.h +++ b/intern/cycles/kernel/svm/svm_light_path.h @@ -31,9 +31,11 @@ ccl_device void svm_node_light_path(ShaderData *sd, ccl_addr_space PathState *st case NODE_LP_reflection: info = (path_flag & PATH_RAY_REFLECT)? 1.0f: 0.0f; break; case NODE_LP_transmission: info = (path_flag & PATH_RAY_TRANSMIT)? 
1.0f: 0.0f; break; case NODE_LP_volume_scatter: info = (path_flag & PATH_RAY_VOLUME_SCATTER)? 1.0f: 0.0f; break; - case NODE_LP_backfacing: info = (ccl_fetch(sd, flag) & SD_BACKFACING)? 1.0f: 0.0f; break; - case NODE_LP_ray_length: info = ccl_fetch(sd, ray_length); break; + case NODE_LP_backfacing: info = (sd->flag & SD_BACKFACING)? 1.0f: 0.0f; break; + case NODE_LP_ray_length: info = sd->ray_length; break; case NODE_LP_ray_depth: info = (float)state->bounce; break; + case NODE_LP_ray_diffuse: info = (float)state->diffuse_bounce; break; + case NODE_LP_ray_glossy: info = (float)state->glossy_bounce; break; case NODE_LP_ray_transparent: info = (float)state->transparent_bounce; break; case NODE_LP_ray_transmission: info = (float)state->transmission_bounce; break; } @@ -54,14 +56,14 @@ ccl_device void svm_node_light_falloff(ShaderData *sd, float *stack, uint4 node) switch(type) { case NODE_LIGHT_FALLOFF_QUADRATIC: break; - case NODE_LIGHT_FALLOFF_LINEAR: strength *= ccl_fetch(sd, ray_length); break; - case NODE_LIGHT_FALLOFF_CONSTANT: strength *= ccl_fetch(sd, ray_length)*ccl_fetch(sd, ray_length); break; + case NODE_LIGHT_FALLOFF_LINEAR: strength *= sd->ray_length; break; + case NODE_LIGHT_FALLOFF_CONSTANT: strength *= sd->ray_length*sd->ray_length; break; } float smooth = stack_load_float(stack, smooth_offset); if(smooth > 0.0f) { - float squared = ccl_fetch(sd, ray_length)*ccl_fetch(sd, ray_length); + float squared = sd->ray_length*sd->ray_length; /* Distant lamps set the ray length to FLT_MAX, which causes squared to overflow. 
*/ if(isfinite(squared)) { strength *= squared/(smooth + squared); diff --git a/intern/cycles/kernel/svm/svm_math_util.h b/intern/cycles/kernel/svm/svm_math_util.h index 01547b60014..1ce7777aac3 100644 --- a/intern/cycles/kernel/svm/svm_math_util.h +++ b/intern/cycles/kernel/svm/svm_math_util.h @@ -100,66 +100,64 @@ ccl_device float svm_math(NodeMath type, float Fac1, float Fac2) return Fac; } -ccl_device float3 svm_math_blackbody_color(float t) { - /* Calculate color in range 800..12000 using an approximation - * a/x+bx+c for R and G and ((at + b)t + c)t + d) for B - * Max absolute error for RGB is (0.00095, 0.00077, 0.00057), - * which is enough to get the same 8 bit/channel color. - */ - - const float rc[6][3] = { - { 2.52432244e+03f, -1.06185848e-03f, 3.11067539e+00f }, - { 3.37763626e+03f, -4.34581697e-04f, 1.64843306e+00f }, - { 4.10671449e+03f, -8.61949938e-05f, 6.41423749e-01f }, - { 4.66849800e+03f, 2.85655028e-05f, 1.29075375e-01f }, - { 4.60124770e+03f, 2.89727618e-05f, 1.48001316e-01f }, - { 3.78765709e+03f, 9.36026367e-06f, 3.98995841e-01f }, - }; - - const float gc[6][3] = { - { -7.50343014e+02f, 3.15679613e-04f, 4.73464526e-01f }, - { -1.00402363e+03f, 1.29189794e-04f, 9.08181524e-01f }, - { -1.22075471e+03f, 2.56245413e-05f, 1.20753416e+00f }, - { -1.42546105e+03f, -4.01730887e-05f, 1.44002695e+00f }, - { -1.18134453e+03f, -2.18913373e-05f, 1.30656109e+00f }, - { -5.00279505e+02f, -4.59745390e-06f, 1.09090465e+00f }, - }; - - const float bc[6][4] = { - { 0.0f, 0.0f, 0.0f, 0.0f }, /* zeros should be optimized by compiler */ - { 0.0f, 0.0f, 0.0f, 0.0f }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - { -2.02524603e-11f, 1.79435860e-07f, -2.60561875e-04f, -1.41761141e-02f }, - { -2.22463426e-13f, -1.55078698e-08f, 3.81675160e-04f, -7.30646033e-01f }, - { 6.72595954e-13f, -2.73059993e-08f, 4.24068546e-04f, -7.52204323e-01f }, - }; - - if(t >= 12000.0f) +/* Calculate color in range 800..12000 using an approximation + * a/x+bx+c for R and G and ((at + b)t + c)t + d) for 
B + * Max absolute error for RGB is (0.00095, 0.00077, 0.00057), + * which is enough to get the same 8 bit/channel color. + */ + +ccl_static_constant float blackbody_table_r[6][3] = { + { 2.52432244e+03f, -1.06185848e-03f, 3.11067539e+00f }, + { 3.37763626e+03f, -4.34581697e-04f, 1.64843306e+00f }, + { 4.10671449e+03f, -8.61949938e-05f, 6.41423749e-01f }, + { 4.66849800e+03f, 2.85655028e-05f, 1.29075375e-01f }, + { 4.60124770e+03f, 2.89727618e-05f, 1.48001316e-01f }, + { 3.78765709e+03f, 9.36026367e-06f, 3.98995841e-01f }, +}; + +ccl_static_constant float blackbody_table_g[6][3] = { + { -7.50343014e+02f, 3.15679613e-04f, 4.73464526e-01f }, + { -1.00402363e+03f, 1.29189794e-04f, 9.08181524e-01f }, + { -1.22075471e+03f, 2.56245413e-05f, 1.20753416e+00f }, + { -1.42546105e+03f, -4.01730887e-05f, 1.44002695e+00f }, + { -1.18134453e+03f, -2.18913373e-05f, 1.30656109e+00f }, + { -5.00279505e+02f, -4.59745390e-06f, 1.09090465e+00f }, +}; + +ccl_static_constant float blackbody_table_b[6][4] = { + { 0.0f, 0.0f, 0.0f, 0.0f }, /* zeros should be optimized by compiler */ + { 0.0f, 0.0f, 0.0f, 0.0f }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + { -2.02524603e-11f, 1.79435860e-07f, -2.60561875e-04f, -1.41761141e-02f }, + { -2.22463426e-13f, -1.55078698e-08f, 3.81675160e-04f, -7.30646033e-01f }, + { 6.72595954e-13f, -2.73059993e-08f, 4.24068546e-04f, -7.52204323e-01f }, +}; + + +ccl_device float3 svm_math_blackbody_color(float t) +{ + if(t >= 12000.0f) { return make_float3(0.826270103f, 0.994478524f, 1.56626022f); + } + else if(t < 965.0f) { + /* For 800 <= t < 965 color does not change in OSL implementation, so keep color the same */ + return make_float3(4.70366907f, 0.0f, 0.0f); + } + + int i = (t >= 6365.0f)? 5: + (t >= 3315.0f)? 4: + (t >= 1902.0f)? 3: + (t >= 1449.0f)? 2: + (t >= 1167.0f)? 
1: 0; + + ccl_constant float *r = blackbody_table_r[i]; + ccl_constant float *g = blackbody_table_g[i]; + ccl_constant float *b = blackbody_table_b[i]; - /* Define a macro to reduce stack usage for nvcc */ -#define MAKE_BB_RGB(i) make_float3(\ - rc[i][0] / t + rc[i][1] * t + rc[i][2],\ - gc[i][0] / t + gc[i][1] * t + gc[i][2],\ - ((bc[i][0] * t + bc[i][1]) * t + bc[i][2]) * t + bc[i][3]) - - if(t >= 6365.0f) - return MAKE_BB_RGB(5); - if(t >= 3315.0f) - return MAKE_BB_RGB(4); - if(t >= 1902.0f) - return MAKE_BB_RGB(3); - if(t >= 1449.0f) - return MAKE_BB_RGB(2); - if(t >= 1167.0f) - return MAKE_BB_RGB(1); - if(t >= 965.0f) - return MAKE_BB_RGB(0); - -#undef MAKE_BB_RGB - - /* For 800 <= t < 965 color does not change in OSL implementation, so keep color the same */ - return make_float3(4.70366907f, 0.0f, 0.0f); + const float t_inv = 1.0f / t; + return make_float3(r[0] * t_inv + r[1] * t + r[2], + g[0] * t_inv + g[1] * t + g[2], + ((b[0] * t + b[1]) * t + b[2]) * t + b[3]); } ccl_device_inline float3 svm_math_gamma_color(float3 color, float gamma) diff --git a/intern/cycles/kernel/svm/svm_noisetex.h b/intern/cycles/kernel/svm/svm_noisetex.h index 62ff38cf1c5..0347ab7b193 100644 --- a/intern/cycles/kernel/svm/svm_noisetex.h +++ b/intern/cycles/kernel/svm/svm_noisetex.h @@ -18,50 +18,42 @@ CCL_NAMESPACE_BEGIN /* Noise */ -ccl_device_inline void svm_noise(float3 p, float detail, float distortion, float *fac, float3 *color) -{ - int hard = 0; - - if(distortion != 0.0f) { - float3 r, offset = make_float3(13.5f, 13.5f, 13.5f); - - r.x = noise(p + offset) * distortion; - r.y = noise(p) * distortion; - r.z = noise(p - offset) * distortion; - - p += r; - } - - *fac = noise_turbulence(p, detail, hard); - *color = make_float3(*fac, - noise_turbulence(make_float3(p.y, p.x, p.z), detail, hard), - noise_turbulence(make_float3(p.y, p.z, p.x), detail, hard)); -} - ccl_device void svm_node_tex_noise(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int *offset) { uint 
co_offset, scale_offset, detail_offset, distortion_offset, fac_offset, color_offset; decode_node_uchar4(node.y, &co_offset, &scale_offset, &detail_offset, &distortion_offset); + decode_node_uchar4(node.z, &color_offset, &fac_offset, NULL, NULL); uint4 node2 = read_node(kg, offset); float scale = stack_load_float_default(stack, scale_offset, node2.x); float detail = stack_load_float_default(stack, detail_offset, node2.y); float distortion = stack_load_float_default(stack, distortion_offset, node2.z); - float3 co = stack_load_float3(stack, co_offset); + float3 p = stack_load_float3(stack, co_offset) * scale; + int hard = 0; - float3 color; - float f; + if(distortion != 0.0f) { + float3 r, offset = make_float3(13.5f, 13.5f, 13.5f); + + r.x = noise(p + offset) * distortion; + r.y = noise(p) * distortion; + r.z = noise(p - offset) * distortion; - svm_noise(co*scale, detail, distortion, &f, &color); + p += r; + } - decode_node_uchar4(node.z, &color_offset, &fac_offset, NULL, NULL); + float f = noise_turbulence(p, detail, hard); - if(stack_valid(fac_offset)) + if(stack_valid(fac_offset)) { stack_store_float(stack, fac_offset, f); - if(stack_valid(color_offset)) + } + if(stack_valid(color_offset)) { + float3 color = make_float3(f, + noise_turbulence(make_float3(p.y, p.x, p.z), detail, hard), + noise_turbulence(make_float3(p.y, p.z, p.x), detail, hard)); stack_store_float3(stack, color_offset, color); + } } CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/svm/svm_tex_coord.h b/intern/cycles/kernel/svm/svm_tex_coord.h index c0b01262212..c94327401f5 100644 --- a/intern/cycles/kernel/svm/svm_tex_coord.h +++ b/intern/cycles/kernel/svm/svm_tex_coord.h @@ -31,9 +31,9 @@ ccl_device void svm_node_tex_coord(KernelGlobals *kg, switch(type) { case NODE_TEXCO_OBJECT: { - data = ccl_fetch(sd, P); + data = sd->P; if(node.w == 0) { - if(ccl_fetch(sd, object) != OBJECT_NONE) { + if(sd->object != OBJECT_NONE) { object_inverse_position_transform(kg, sd, &data); } } @@ -48,47 +48,47 @@ 
ccl_device void svm_node_tex_coord(KernelGlobals *kg, break; } case NODE_TEXCO_NORMAL: { - data = ccl_fetch(sd, N); + data = sd->N; object_inverse_normal_transform(kg, sd, &data); break; } case NODE_TEXCO_CAMERA: { Transform tfm = kernel_data.cam.worldtocamera; - if(ccl_fetch(sd, object) != OBJECT_NONE) - data = transform_point(&tfm, ccl_fetch(sd, P)); + if(sd->object != OBJECT_NONE) + data = transform_point(&tfm, sd->P); else - data = transform_point(&tfm, ccl_fetch(sd, P) + camera_position(kg)); + data = transform_point(&tfm, sd->P + camera_position(kg)); break; } case NODE_TEXCO_WINDOW: { - if((path_flag & PATH_RAY_CAMERA) && ccl_fetch(sd, object) == OBJECT_NONE && kernel_data.cam.type == CAMERA_ORTHOGRAPHIC) - data = camera_world_to_ndc(kg, sd, ccl_fetch(sd, ray_P)); + if((path_flag & PATH_RAY_CAMERA) && sd->object == OBJECT_NONE && kernel_data.cam.type == CAMERA_ORTHOGRAPHIC) + data = camera_world_to_ndc(kg, sd, sd->ray_P); else - data = camera_world_to_ndc(kg, sd, ccl_fetch(sd, P)); + data = camera_world_to_ndc(kg, sd, sd->P); data.z = 0.0f; break; } case NODE_TEXCO_REFLECTION: { - if(ccl_fetch(sd, object) != OBJECT_NONE) - data = 2.0f*dot(ccl_fetch(sd, N), ccl_fetch(sd, I))*ccl_fetch(sd, N) - ccl_fetch(sd, I); + if(sd->object != OBJECT_NONE) + data = 2.0f*dot(sd->N, sd->I)*sd->N - sd->I; else - data = ccl_fetch(sd, I); + data = sd->I; break; } case NODE_TEXCO_DUPLI_GENERATED: { - data = object_dupli_generated(kg, ccl_fetch(sd, object)); + data = object_dupli_generated(kg, sd->object); break; } case NODE_TEXCO_DUPLI_UV: { - data = object_dupli_uv(kg, ccl_fetch(sd, object)); + data = object_dupli_uv(kg, sd->object); break; } case NODE_TEXCO_VOLUME_GENERATED: { - data = ccl_fetch(sd, P); + data = sd->P; #ifdef __VOLUME__ - if(ccl_fetch(sd, object) != OBJECT_NONE) + if(sd->object != OBJECT_NONE) data = volume_normalized_position(kg, sd, data); #endif break; @@ -112,9 +112,9 @@ ccl_device void svm_node_tex_coord_bump_dx(KernelGlobals *kg, switch(type) { case 
NODE_TEXCO_OBJECT: { - data = ccl_fetch(sd, P) + ccl_fetch(sd, dP).dx; + data = sd->P + sd->dP.dx; if(node.w == 0) { - if(ccl_fetch(sd, object) != OBJECT_NONE) { + if(sd->object != OBJECT_NONE) { object_inverse_position_transform(kg, sd, &data); } } @@ -129,47 +129,47 @@ ccl_device void svm_node_tex_coord_bump_dx(KernelGlobals *kg, break; } case NODE_TEXCO_NORMAL: { - data = ccl_fetch(sd, N); + data = sd->N; object_inverse_normal_transform(kg, sd, &data); break; } case NODE_TEXCO_CAMERA: { Transform tfm = kernel_data.cam.worldtocamera; - if(ccl_fetch(sd, object) != OBJECT_NONE) - data = transform_point(&tfm, ccl_fetch(sd, P) + ccl_fetch(sd, dP).dx); + if(sd->object != OBJECT_NONE) + data = transform_point(&tfm, sd->P + sd->dP.dx); else - data = transform_point(&tfm, ccl_fetch(sd, P) + ccl_fetch(sd, dP).dx + camera_position(kg)); + data = transform_point(&tfm, sd->P + sd->dP.dx + camera_position(kg)); break; } case NODE_TEXCO_WINDOW: { - if((path_flag & PATH_RAY_CAMERA) && ccl_fetch(sd, object) == OBJECT_NONE && kernel_data.cam.type == CAMERA_ORTHOGRAPHIC) - data = camera_world_to_ndc(kg, sd, ccl_fetch(sd, ray_P) + ccl_fetch(sd, ray_dP).dx); + if((path_flag & PATH_RAY_CAMERA) && sd->object == OBJECT_NONE && kernel_data.cam.type == CAMERA_ORTHOGRAPHIC) + data = camera_world_to_ndc(kg, sd, sd->ray_P + sd->ray_dP.dx); else - data = camera_world_to_ndc(kg, sd, ccl_fetch(sd, P) + ccl_fetch(sd, dP).dx); + data = camera_world_to_ndc(kg, sd, sd->P + sd->dP.dx); data.z = 0.0f; break; } case NODE_TEXCO_REFLECTION: { - if(ccl_fetch(sd, object) != OBJECT_NONE) - data = 2.0f*dot(ccl_fetch(sd, N), ccl_fetch(sd, I))*ccl_fetch(sd, N) - ccl_fetch(sd, I); + if(sd->object != OBJECT_NONE) + data = 2.0f*dot(sd->N, sd->I)*sd->N - sd->I; else - data = ccl_fetch(sd, I); + data = sd->I; break; } case NODE_TEXCO_DUPLI_GENERATED: { - data = object_dupli_generated(kg, ccl_fetch(sd, object)); + data = object_dupli_generated(kg, sd->object); break; } case NODE_TEXCO_DUPLI_UV: { - data = 
object_dupli_uv(kg, ccl_fetch(sd, object)); + data = object_dupli_uv(kg, sd->object); break; } case NODE_TEXCO_VOLUME_GENERATED: { - data = ccl_fetch(sd, P) + ccl_fetch(sd, dP).dx; + data = sd->P + sd->dP.dx; #ifdef __VOLUME__ - if(ccl_fetch(sd, object) != OBJECT_NONE) + if(sd->object != OBJECT_NONE) data = volume_normalized_position(kg, sd, data); #endif break; @@ -196,9 +196,9 @@ ccl_device void svm_node_tex_coord_bump_dy(KernelGlobals *kg, switch(type) { case NODE_TEXCO_OBJECT: { - data = ccl_fetch(sd, P) + ccl_fetch(sd, dP).dy; + data = sd->P + sd->dP.dy; if(node.w == 0) { - if(ccl_fetch(sd, object) != OBJECT_NONE) { + if(sd->object != OBJECT_NONE) { object_inverse_position_transform(kg, sd, &data); } } @@ -213,47 +213,47 @@ ccl_device void svm_node_tex_coord_bump_dy(KernelGlobals *kg, break; } case NODE_TEXCO_NORMAL: { - data = ccl_fetch(sd, N); + data = sd->N; object_inverse_normal_transform(kg, sd, &data); break; } case NODE_TEXCO_CAMERA: { Transform tfm = kernel_data.cam.worldtocamera; - if(ccl_fetch(sd, object) != OBJECT_NONE) - data = transform_point(&tfm, ccl_fetch(sd, P) + ccl_fetch(sd, dP).dy); + if(sd->object != OBJECT_NONE) + data = transform_point(&tfm, sd->P + sd->dP.dy); else - data = transform_point(&tfm, ccl_fetch(sd, P) + ccl_fetch(sd, dP).dy + camera_position(kg)); + data = transform_point(&tfm, sd->P + sd->dP.dy + camera_position(kg)); break; } case NODE_TEXCO_WINDOW: { - if((path_flag & PATH_RAY_CAMERA) && ccl_fetch(sd, object) == OBJECT_NONE && kernel_data.cam.type == CAMERA_ORTHOGRAPHIC) - data = camera_world_to_ndc(kg, sd, ccl_fetch(sd, ray_P) + ccl_fetch(sd, ray_dP).dy); + if((path_flag & PATH_RAY_CAMERA) && sd->object == OBJECT_NONE && kernel_data.cam.type == CAMERA_ORTHOGRAPHIC) + data = camera_world_to_ndc(kg, sd, sd->ray_P + sd->ray_dP.dy); else - data = camera_world_to_ndc(kg, sd, ccl_fetch(sd, P) + ccl_fetch(sd, dP).dy); + data = camera_world_to_ndc(kg, sd, sd->P + sd->dP.dy); data.z = 0.0f; break; } case NODE_TEXCO_REFLECTION: { - 
if(ccl_fetch(sd, object) != OBJECT_NONE) - data = 2.0f*dot(ccl_fetch(sd, N), ccl_fetch(sd, I))*ccl_fetch(sd, N) - ccl_fetch(sd, I); + if(sd->object != OBJECT_NONE) + data = 2.0f*dot(sd->N, sd->I)*sd->N - sd->I; else - data = ccl_fetch(sd, I); + data = sd->I; break; } case NODE_TEXCO_DUPLI_GENERATED: { - data = object_dupli_generated(kg, ccl_fetch(sd, object)); + data = object_dupli_generated(kg, sd->object); break; } case NODE_TEXCO_DUPLI_UV: { - data = object_dupli_uv(kg, ccl_fetch(sd, object)); + data = object_dupli_uv(kg, sd->object); break; } case NODE_TEXCO_VOLUME_GENERATED: { - data = ccl_fetch(sd, P) + ccl_fetch(sd, dP).dy; + data = sd->P + sd->dP.dy; #ifdef __VOLUME__ - if(ccl_fetch(sd, object) != OBJECT_NONE) + if(sd->object != OBJECT_NONE) data = volume_normalized_position(kg, sd, data); #endif break; @@ -274,12 +274,12 @@ ccl_device void svm_node_normal_map(KernelGlobals *kg, ShaderData *sd, float *st float3 color = stack_load_float3(stack, color_offset); color = 2.0f*make_float3(color.x - 0.5f, color.y - 0.5f, color.z - 0.5f); - bool is_backfacing = (ccl_fetch(sd, flag) & SD_BACKFACING) != 0; + bool is_backfacing = (sd->flag & SD_BACKFACING) != 0; float3 N; if(space == NODE_NORMAL_MAP_TANGENT) { /* tangent space */ - if(ccl_fetch(sd, object) == OBJECT_NONE) { + if(sd->object == OBJECT_NONE) { stack_store_float3(stack, normal_offset, make_float3(0.0f, 0.0f, 0.0f)); return; } @@ -299,11 +299,11 @@ ccl_device void svm_node_normal_map(KernelGlobals *kg, ShaderData *sd, float *st float sign = primitive_attribute_float(kg, sd, attr_sign, NULL, NULL); float3 normal; - if(ccl_fetch(sd, shader) & SHADER_SMOOTH_NORMAL) { + if(sd->shader & SHADER_SMOOTH_NORMAL) { normal = primitive_attribute_float3(kg, sd, attr_normal, NULL, NULL); } else { - normal = ccl_fetch(sd, Ng); + normal = sd->Ng; /* the normal is already inverted, which is too soon for the math here */ if(is_backfacing) { @@ -345,11 +345,11 @@ ccl_device void svm_node_normal_map(KernelGlobals *kg, 
ShaderData *sd, float *st if(strength != 1.0f) { strength = max(strength, 0.0f); - N = safe_normalize(ccl_fetch(sd, N) + (N - ccl_fetch(sd, N))*strength); + N = safe_normalize(sd->N + (N - sd->N)*strength); } if(is_zero(N)) { - N = ccl_fetch(sd, N); + N = sd->N; } stack_store_float3(stack, normal_offset, N); @@ -377,7 +377,7 @@ ccl_device void svm_node_tangent(KernelGlobals *kg, ShaderData *sd, float *stack float3 generated; if(desc.offset == ATTR_STD_NOT_FOUND) - generated = ccl_fetch(sd, P); + generated = sd->P; else generated = primitive_attribute_float3(kg, sd, desc, NULL, NULL); @@ -390,7 +390,7 @@ ccl_device void svm_node_tangent(KernelGlobals *kg, ShaderData *sd, float *stack } object_normal_transform(kg, sd, &tangent); - tangent = cross(ccl_fetch(sd, N), normalize(cross(tangent, ccl_fetch(sd, N)))); + tangent = cross(sd->N, normalize(cross(tangent, sd->N))); stack_store_float3(stack, tangent_offset, tangent); } diff --git a/intern/cycles/kernel/svm/svm_types.h b/intern/cycles/kernel/svm/svm_types.h index 5adf7d34f7f..d859cae1708 100644 --- a/intern/cycles/kernel/svm/svm_types.h +++ b/intern/cycles/kernel/svm/svm_types.h @@ -188,6 +188,8 @@ typedef enum NodeLightPath { NODE_LP_backfacing, NODE_LP_ray_length, NODE_LP_ray_depth, + NODE_LP_ray_diffuse, + NODE_LP_ray_glossy, NODE_LP_ray_transparent, NODE_LP_ray_transmission, } NodeLightPath; @@ -395,17 +397,23 @@ typedef enum ClosureType { CLOSURE_BSDF_DIFFUSE_ID, CLOSURE_BSDF_OREN_NAYAR_ID, CLOSURE_BSDF_DIFFUSE_RAMP_ID, + CLOSURE_BSDF_PRINCIPLED_DIFFUSE_ID, + CLOSURE_BSDF_PRINCIPLED_SHEEN_ID, CLOSURE_BSDF_DIFFUSE_TOON_ID, /* Glossy */ - CLOSURE_BSDF_GLOSSY_ID, CLOSURE_BSDF_REFLECTION_ID, CLOSURE_BSDF_MICROFACET_GGX_ID, + CLOSURE_BSDF_MICROFACET_GGX_FRESNEL_ID, + CLOSURE_BSDF_MICROFACET_GGX_CLEARCOAT_ID, CLOSURE_BSDF_MICROFACET_BECKMANN_ID, CLOSURE_BSDF_MICROFACET_MULTI_GGX_ID, + CLOSURE_BSDF_MICROFACET_MULTI_GGX_FRESNEL_ID, CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ID, CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID, + 
CLOSURE_BSDF_MICROFACET_GGX_ANISO_FRESNEL_ID, CLOSURE_BSDF_MICROFACET_MULTI_GGX_ANISO_ID, + CLOSURE_BSDF_MICROFACET_MULTI_GGX_ANISO_FRESNEL_ID, CLOSURE_BSDF_MICROFACET_BECKMANN_ANISO_ID, CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ANISO_ID, CLOSURE_BSDF_ASHIKHMIN_VELVET_ID, @@ -414,24 +422,26 @@ typedef enum ClosureType { CLOSURE_BSDF_HAIR_REFLECTION_ID, /* Transmission */ - CLOSURE_BSDF_TRANSMISSION_ID, CLOSURE_BSDF_TRANSLUCENT_ID, CLOSURE_BSDF_REFRACTION_ID, CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID, CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID, + CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID, CLOSURE_BSDF_MICROFACET_BECKMANN_GLASS_ID, CLOSURE_BSDF_MICROFACET_GGX_GLASS_ID, - CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID, + CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_FRESNEL_ID, CLOSURE_BSDF_SHARP_GLASS_ID, CLOSURE_BSDF_HAIR_TRANSMISSION_ID, /* Special cases */ CLOSURE_BSDF_BSSRDF_ID, + CLOSURE_BSDF_BSSRDF_PRINCIPLED_ID, CLOSURE_BSDF_TRANSPARENT_ID, /* BSSRDF */ CLOSURE_BSSRDF_CUBIC_ID, CLOSURE_BSSRDF_GAUSSIAN_ID, + CLOSURE_BSSRDF_PRINCIPLED_ID, CLOSURE_BSSRDF_BURLEY_ID, /* Other */ @@ -445,19 +455,24 @@ typedef enum ClosureType { CLOSURE_VOLUME_ABSORPTION_ID, CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID, + CLOSURE_BSDF_PRINCIPLED_ID, + NBUILTIN_CLOSURES } ClosureType; /* watch this, being lazy with memory usage */ #define CLOSURE_IS_BSDF(type) (type <= CLOSURE_BSDF_TRANSPARENT_ID) #define CLOSURE_IS_BSDF_DIFFUSE(type) (type >= CLOSURE_BSDF_DIFFUSE_ID && type <= CLOSURE_BSDF_DIFFUSE_TOON_ID) -#define CLOSURE_IS_BSDF_GLOSSY(type) (type >= CLOSURE_BSDF_GLOSSY_ID && type <= CLOSURE_BSDF_HAIR_REFLECTION_ID) -#define CLOSURE_IS_BSDF_TRANSMISSION(type) (type >= CLOSURE_BSDF_TRANSMISSION_ID && type <= CLOSURE_BSDF_HAIR_TRANSMISSION_ID) -#define CLOSURE_IS_BSDF_BSSRDF(type) (type == CLOSURE_BSDF_BSSRDF_ID) +#define CLOSURE_IS_BSDF_GLOSSY(type) (type >= CLOSURE_BSDF_REFLECTION_ID && type <= CLOSURE_BSDF_HAIR_REFLECTION_ID) +#define CLOSURE_IS_BSDF_TRANSMISSION(type) (type >= CLOSURE_BSDF_TRANSLUCENT_ID 
&& type <= CLOSURE_BSDF_HAIR_TRANSMISSION_ID) +#define CLOSURE_IS_BSDF_BSSRDF(type) (type == CLOSURE_BSDF_BSSRDF_ID || type == CLOSURE_BSDF_BSSRDF_PRINCIPLED_ID) +#define CLOSURE_IS_BSDF_TRANSPARENT(type) (type == CLOSURE_BSDF_TRANSPARENT_ID) #define CLOSURE_IS_BSDF_ANISOTROPIC(type) (type >= CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID && type <= CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ANISO_ID) #define CLOSURE_IS_BSDF_MULTISCATTER(type) (type == CLOSURE_BSDF_MICROFACET_MULTI_GGX_ID ||\ type == CLOSURE_BSDF_MICROFACET_MULTI_GGX_ANISO_ID || \ - type == CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID) + type == CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID) +#define CLOSURE_IS_BSDF_MICROFACET(type) ((type >= CLOSURE_BSDF_REFLECTION_ID && type <= CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ANISO_ID) ||\ + (type >= CLOSURE_BSDF_REFRACTION_ID && type <= CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID)) #define CLOSURE_IS_BSDF_OR_BSSRDF(type) (type <= CLOSURE_BSSRDF_BURLEY_ID) #define CLOSURE_IS_BSSRDF(type) (type >= CLOSURE_BSSRDF_CUBIC_ID && type <= CLOSURE_BSSRDF_BURLEY_ID) #define CLOSURE_IS_VOLUME(type) (type >= CLOSURE_VOLUME_ID && type <= CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID) @@ -466,7 +481,8 @@ typedef enum ClosureType { #define CLOSURE_IS_BACKGROUND(type) (type == CLOSURE_BACKGROUND_ID) #define CLOSURE_IS_AMBIENT_OCCLUSION(type) (type == CLOSURE_AMBIENT_OCCLUSION_ID) #define CLOSURE_IS_PHASE(type) (type == CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID) -#define CLOSURE_IS_GLASS(type) (type >= CLOSURE_BSDF_MICROFACET_BECKMANN_GLASS_ID && type <= CLOSURE_BSDF_SHARP_GLASS_ID) +#define CLOSURE_IS_GLASS(type) (type >= CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID && type <= CLOSURE_BSDF_SHARP_GLASS_ID) +#define CLOSURE_IS_PRINCIPLED(type) (type == CLOSURE_BSDF_PRINCIPLED_ID) #define CLOSURE_WEIGHT_CUTOFF 1e-5f diff --git a/intern/cycles/kernel/svm/svm_vector_transform.h b/intern/cycles/kernel/svm/svm_vector_transform.h index 4c32130d06d..4e92f27acdb 100644 --- a/intern/cycles/kernel/svm/svm_vector_transform.h +++ 
b/intern/cycles/kernel/svm/svm_vector_transform.h @@ -33,7 +33,7 @@ ccl_device void svm_node_vector_transform(KernelGlobals *kg, ShaderData *sd, flo NodeVectorTransformConvertSpace to = (NodeVectorTransformConvertSpace)ito; Transform tfm; - bool is_object = (ccl_fetch(sd, object) != OBJECT_NONE); + bool is_object = (sd->object != OBJECT_NONE); bool is_direction = (type == NODE_VECTOR_TRANSFORM_TYPE_VECTOR || type == NODE_VECTOR_TRANSFORM_TYPE_NORMAL); /* From world */ diff --git a/intern/cycles/kernel/svm/svm_voxel.h b/intern/cycles/kernel/svm/svm_voxel.h index a8b3604a8a7..d967516a5c9 100644 --- a/intern/cycles/kernel/svm/svm_voxel.h +++ b/intern/cycles/kernel/svm/svm_voxel.h @@ -42,24 +42,8 @@ ccl_device void svm_node_tex_voxel(KernelGlobals *kg, tfm.w = read_node_float(kg, offset); co = transform_point(&tfm, co); } - float4 r; -# if defined(__KERNEL_CUDA__) -# if __CUDA_ARCH__ >= 300 - CUtexObject tex = kernel_tex_fetch(__bindless_mapping, id); - if(id < 2048) /* TODO(dingto): Make this a variable */ - r = kernel_tex_image_interp_3d_float4(tex, co.x, co.y, co.z); - else { - float f = kernel_tex_image_interp_3d_float(tex, co.x, co.y, co.z); - r = make_float4(f, f, f, 1.0f); - } -# else /* __CUDA_ARCH__ >= 300 */ - r = volume_image_texture_3d(id, co.x, co.y, co.z); -# endif -# elif defined(__KERNEL_OPENCL__) - r = kernel_tex_image_interp_3d(kg, id, co.x, co.y, co.z); -# else - r = kernel_tex_image_interp_3d(id, co.x, co.y, co.z); -# endif /* __KERNEL_CUDA__ */ + + float4 r = kernel_tex_image_interp_3d(kg, id, co.x, co.y, co.z, INTERPOLATION_NONE); #else float4 r = make_float4(0.0f, 0.0f, 0.0f, 0.0f); #endif diff --git a/intern/cycles/kernel/svm/svm_wavelength.h b/intern/cycles/kernel/svm/svm_wavelength.h index 57030f3979d..855b356b397 100644 --- a/intern/cycles/kernel/svm/svm_wavelength.h +++ b/intern/cycles/kernel/svm/svm_wavelength.h @@ -34,44 +34,44 @@ CCL_NAMESPACE_BEGIN /* Wavelength to RGB */ +// CIE colour matching functions xBar, yBar, and zBar for +// 
wavelengths from 380 through 780 nanometers, every 5 +// nanometers. For a wavelength lambda in this range: +// cie_colour_match[(lambda - 380) / 5][0] = xBar +// cie_colour_match[(lambda - 380) / 5][1] = yBar +// cie_colour_match[(lambda - 380) / 5][2] = zBar +ccl_static_constant float cie_colour_match[81][3] = { + {0.0014f,0.0000f,0.0065f}, {0.0022f,0.0001f,0.0105f}, {0.0042f,0.0001f,0.0201f}, + {0.0076f,0.0002f,0.0362f}, {0.0143f,0.0004f,0.0679f}, {0.0232f,0.0006f,0.1102f}, + {0.0435f,0.0012f,0.2074f}, {0.0776f,0.0022f,0.3713f}, {0.1344f,0.0040f,0.6456f}, + {0.2148f,0.0073f,1.0391f}, {0.2839f,0.0116f,1.3856f}, {0.3285f,0.0168f,1.6230f}, + {0.3483f,0.0230f,1.7471f}, {0.3481f,0.0298f,1.7826f}, {0.3362f,0.0380f,1.7721f}, + {0.3187f,0.0480f,1.7441f}, {0.2908f,0.0600f,1.6692f}, {0.2511f,0.0739f,1.5281f}, + {0.1954f,0.0910f,1.2876f}, {0.1421f,0.1126f,1.0419f}, {0.0956f,0.1390f,0.8130f}, + {0.0580f,0.1693f,0.6162f}, {0.0320f,0.2080f,0.4652f}, {0.0147f,0.2586f,0.3533f}, + {0.0049f,0.3230f,0.2720f}, {0.0024f,0.4073f,0.2123f}, {0.0093f,0.5030f,0.1582f}, + {0.0291f,0.6082f,0.1117f}, {0.0633f,0.7100f,0.0782f}, {0.1096f,0.7932f,0.0573f}, + {0.1655f,0.8620f,0.0422f}, {0.2257f,0.9149f,0.0298f}, {0.2904f,0.9540f,0.0203f}, + {0.3597f,0.9803f,0.0134f}, {0.4334f,0.9950f,0.0087f}, {0.5121f,1.0000f,0.0057f}, + {0.5945f,0.9950f,0.0039f}, {0.6784f,0.9786f,0.0027f}, {0.7621f,0.9520f,0.0021f}, + {0.8425f,0.9154f,0.0018f}, {0.9163f,0.8700f,0.0017f}, {0.9786f,0.8163f,0.0014f}, + {1.0263f,0.7570f,0.0011f}, {1.0567f,0.6949f,0.0010f}, {1.0622f,0.6310f,0.0008f}, + {1.0456f,0.5668f,0.0006f}, {1.0026f,0.5030f,0.0003f}, {0.9384f,0.4412f,0.0002f}, + {0.8544f,0.3810f,0.0002f}, {0.7514f,0.3210f,0.0001f}, {0.6424f,0.2650f,0.0000f}, + {0.5419f,0.2170f,0.0000f}, {0.4479f,0.1750f,0.0000f}, {0.3608f,0.1382f,0.0000f}, + {0.2835f,0.1070f,0.0000f}, {0.2187f,0.0816f,0.0000f}, {0.1649f,0.0610f,0.0000f}, + {0.1212f,0.0446f,0.0000f}, {0.0874f,0.0320f,0.0000f}, {0.0636f,0.0232f,0.0000f}, + 
{0.0468f,0.0170f,0.0000f}, {0.0329f,0.0119f,0.0000f}, {0.0227f,0.0082f,0.0000f}, + {0.0158f,0.0057f,0.0000f}, {0.0114f,0.0041f,0.0000f}, {0.0081f,0.0029f,0.0000f}, + {0.0058f,0.0021f,0.0000f}, {0.0041f,0.0015f,0.0000f}, {0.0029f,0.0010f,0.0000f}, + {0.0020f,0.0007f,0.0000f}, {0.0014f,0.0005f,0.0000f}, {0.0010f,0.0004f,0.0000f}, + {0.0007f,0.0002f,0.0000f}, {0.0005f,0.0002f,0.0000f}, {0.0003f,0.0001f,0.0000f}, + {0.0002f,0.0001f,0.0000f}, {0.0002f,0.0001f,0.0000f}, {0.0001f,0.0000f,0.0000f}, + {0.0001f,0.0000f,0.0000f}, {0.0001f,0.0000f,0.0000f}, {0.0000f,0.0000f,0.0000f} +}; + ccl_device void svm_node_wavelength(ShaderData *sd, float *stack, uint wavelength, uint color_out) { - // CIE colour matching functions xBar, yBar, and zBar for - // wavelengths from 380 through 780 nanometers, every 5 - // nanometers. For a wavelength lambda in this range: - // cie_colour_match[(lambda - 380) / 5][0] = xBar - // cie_colour_match[(lambda - 380) / 5][1] = yBar - // cie_colour_match[(lambda - 380) / 5][2] = zBar - const float cie_colour_match[81][3] = { - {0.0014f,0.0000f,0.0065f}, {0.0022f,0.0001f,0.0105f}, {0.0042f,0.0001f,0.0201f}, - {0.0076f,0.0002f,0.0362f}, {0.0143f,0.0004f,0.0679f}, {0.0232f,0.0006f,0.1102f}, - {0.0435f,0.0012f,0.2074f}, {0.0776f,0.0022f,0.3713f}, {0.1344f,0.0040f,0.6456f}, - {0.2148f,0.0073f,1.0391f}, {0.2839f,0.0116f,1.3856f}, {0.3285f,0.0168f,1.6230f}, - {0.3483f,0.0230f,1.7471f}, {0.3481f,0.0298f,1.7826f}, {0.3362f,0.0380f,1.7721f}, - {0.3187f,0.0480f,1.7441f}, {0.2908f,0.0600f,1.6692f}, {0.2511f,0.0739f,1.5281f}, - {0.1954f,0.0910f,1.2876f}, {0.1421f,0.1126f,1.0419f}, {0.0956f,0.1390f,0.8130f}, - {0.0580f,0.1693f,0.6162f}, {0.0320f,0.2080f,0.4652f}, {0.0147f,0.2586f,0.3533f}, - {0.0049f,0.3230f,0.2720f}, {0.0024f,0.4073f,0.2123f}, {0.0093f,0.5030f,0.1582f}, - {0.0291f,0.6082f,0.1117f}, {0.0633f,0.7100f,0.0782f}, {0.1096f,0.7932f,0.0573f}, - {0.1655f,0.8620f,0.0422f}, {0.2257f,0.9149f,0.0298f}, {0.2904f,0.9540f,0.0203f}, - {0.3597f,0.9803f,0.0134f}, 
{0.4334f,0.9950f,0.0087f}, {0.5121f,1.0000f,0.0057f}, - {0.5945f,0.9950f,0.0039f}, {0.6784f,0.9786f,0.0027f}, {0.7621f,0.9520f,0.0021f}, - {0.8425f,0.9154f,0.0018f}, {0.9163f,0.8700f,0.0017f}, {0.9786f,0.8163f,0.0014f}, - {1.0263f,0.7570f,0.0011f}, {1.0567f,0.6949f,0.0010f}, {1.0622f,0.6310f,0.0008f}, - {1.0456f,0.5668f,0.0006f}, {1.0026f,0.5030f,0.0003f}, {0.9384f,0.4412f,0.0002f}, - {0.8544f,0.3810f,0.0002f}, {0.7514f,0.3210f,0.0001f}, {0.6424f,0.2650f,0.0000f}, - {0.5419f,0.2170f,0.0000f}, {0.4479f,0.1750f,0.0000f}, {0.3608f,0.1382f,0.0000f}, - {0.2835f,0.1070f,0.0000f}, {0.2187f,0.0816f,0.0000f}, {0.1649f,0.0610f,0.0000f}, - {0.1212f,0.0446f,0.0000f}, {0.0874f,0.0320f,0.0000f}, {0.0636f,0.0232f,0.0000f}, - {0.0468f,0.0170f,0.0000f}, {0.0329f,0.0119f,0.0000f}, {0.0227f,0.0082f,0.0000f}, - {0.0158f,0.0057f,0.0000f}, {0.0114f,0.0041f,0.0000f}, {0.0081f,0.0029f,0.0000f}, - {0.0058f,0.0021f,0.0000f}, {0.0041f,0.0015f,0.0000f}, {0.0029f,0.0010f,0.0000f}, - {0.0020f,0.0007f,0.0000f}, {0.0014f,0.0005f,0.0000f}, {0.0010f,0.0004f,0.0000f}, - {0.0007f,0.0002f,0.0000f}, {0.0005f,0.0002f,0.0000f}, {0.0003f,0.0001f,0.0000f}, - {0.0002f,0.0001f,0.0000f}, {0.0002f,0.0001f,0.0000f}, {0.0001f,0.0000f,0.0000f}, - {0.0001f,0.0000f,0.0000f}, {0.0001f,0.0000f,0.0000f}, {0.0000f,0.0000f,0.0000f} - }; - float lambda_nm = stack_load_float(stack, wavelength); float ii = (lambda_nm-380.0f) * (1.0f/5.0f); // scaled 0..80 int i = float_to_int(ii); @@ -82,7 +82,7 @@ ccl_device void svm_node_wavelength(ShaderData *sd, float *stack, uint wavelengt } else { ii -= i; - const float *c = cie_colour_match[i]; + ccl_constant float *c = cie_colour_match[i]; color = interp(make_float3(c[0], c[1], c[2]), make_float3(c[3], c[4], c[5]), ii); } @@ -92,8 +92,7 @@ ccl_device void svm_node_wavelength(ShaderData *sd, float *stack, uint wavelengt /* Clamp to zero if values are smaller */ color = max(color, make_float3(0.0f, 0.0f, 0.0f)); - if(stack_valid(color_out)) - stack_store_float3(stack, color_out, 
color); + stack_store_float3(stack, color_out, color); } CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/svm/svm_wireframe.h b/intern/cycles/kernel/svm/svm_wireframe.h index 6eed9bc1a99..3c6353c8001 100644 --- a/intern/cycles/kernel/svm/svm_wireframe.h +++ b/intern/cycles/kernel/svm/svm_wireframe.h @@ -41,9 +41,9 @@ ccl_device_inline float wireframe(KernelGlobals *kg, float3 *P) { #ifdef __HAIR__ - if(ccl_fetch(sd, prim) != PRIM_NONE && ccl_fetch(sd, type) & PRIMITIVE_ALL_TRIANGLE) + if(sd->prim != PRIM_NONE && sd->type & PRIMITIVE_ALL_TRIANGLE) #else - if(ccl_fetch(sd, prim) != PRIM_NONE) + if(sd->prim != PRIM_NONE) #endif { float3 Co[3]; @@ -52,12 +52,12 @@ ccl_device_inline float wireframe(KernelGlobals *kg, /* Triangles */ int np = 3; - if(ccl_fetch(sd, type) & PRIMITIVE_TRIANGLE) - triangle_vertices(kg, ccl_fetch(sd, prim), Co); + if(sd->type & PRIMITIVE_TRIANGLE) + triangle_vertices(kg, sd->prim, Co); else - motion_triangle_vertices(kg, ccl_fetch(sd, object), ccl_fetch(sd, prim), ccl_fetch(sd, time), Co); + motion_triangle_vertices(kg, sd->object, sd->prim, sd->time, Co); - if(!(ccl_fetch(sd, flag) & SD_TRANSFORM_APPLIED)) { + if(!(sd->object_flag & SD_OBJECT_TRANSFORM_APPLIED)) { object_position_transform(kg, sd, &Co[0]); object_position_transform(kg, sd, &Co[1]); object_position_transform(kg, sd, &Co[2]); @@ -66,8 +66,8 @@ ccl_device_inline float wireframe(KernelGlobals *kg, if(pixel_size) { // Project the derivatives of P to the viewing plane defined // by I so we have a measure of how big is a pixel at this point - float pixelwidth_x = len(ccl_fetch(sd, dP).dx - dot(ccl_fetch(sd, dP).dx, ccl_fetch(sd, I)) * ccl_fetch(sd, I)); - float pixelwidth_y = len(ccl_fetch(sd, dP).dy - dot(ccl_fetch(sd, dP).dy, ccl_fetch(sd, I)) * ccl_fetch(sd, I)); + float pixelwidth_x = len(sd->dP.dx - dot(sd->dP.dx, sd->I) * sd->I); + float pixelwidth_y = len(sd->dP.dy - dot(sd->dP.dy, sd->I) * sd->I); // Take the average of both axis' length pixelwidth = (pixelwidth_x + 
pixelwidth_y) * 0.5f; } @@ -113,20 +113,20 @@ ccl_device void svm_node_wireframe(KernelGlobals *kg, * With OpenCL 2.0 it's possible to avoid this change, but for until * then we'll be living with such an exception. */ - float3 P = ccl_fetch(sd, P); + float3 P = sd->P; float f = wireframe(kg, sd, size, pixel_size, &P); #else - float f = wireframe(kg, sd, size, pixel_size, &ccl_fetch(sd, P)); + float f = wireframe(kg, sd, size, pixel_size, &sd->P); #endif /* TODO(sergey): Think of faster way to calculate derivatives. */ if(bump_offset == NODE_BUMP_OFFSET_DX) { - float3 Px = ccl_fetch(sd, P) - ccl_fetch(sd, dP).dx; - f += (f - wireframe(kg, sd, size, pixel_size, &Px)) / len(ccl_fetch(sd, dP).dx); + float3 Px = sd->P - sd->dP.dx; + f += (f - wireframe(kg, sd, size, pixel_size, &Px)) / len(sd->dP.dx); } else if(bump_offset == NODE_BUMP_OFFSET_DY) { - float3 Py = ccl_fetch(sd, P) - ccl_fetch(sd, dP).dy; - f += (f - wireframe(kg, sd, size, pixel_size, &Py)) / len(ccl_fetch(sd, dP).dy); + float3 Py = sd->P - sd->dP.dy; + f += (f - wireframe(kg, sd, size, pixel_size, &Py)) / len(sd->dP.dy); } if(stack_valid(out_fac)) |