diff options
Diffstat (limited to 'intern/cycles/kernel')
138 files changed, 7356 insertions, 2554 deletions
diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt index a25eb3f5b50..18d1360542c 100644 --- a/intern/cycles/kernel/CMakeLists.txt +++ b/intern/cycles/kernel/CMakeLists.txt @@ -1,3 +1,4 @@ +remove_extra_strict_flags() set(INC . @@ -11,9 +12,20 @@ set(INC_SYS ) set(SRC - kernel.cpp - kernel.cl - kernel.cu + kernels/cpu/kernel.cpp + kernels/opencl/kernel.cl + kernels/opencl/kernel_data_init.cl + kernels/opencl/kernel_queue_enqueue.cl + kernels/opencl/kernel_scene_intersect.cl + kernels/opencl/kernel_lamp_emission.cl + kernels/opencl/kernel_background_buffer_update.cl + kernels/opencl/kernel_shader_eval.cl + kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl + kernels/opencl/kernel_direct_lighting.cl + kernels/opencl/kernel_shadow_blocked.cl + kernels/opencl/kernel_next_iteration_setup.cl + kernels/opencl/kernel_sum_all_radiance.cl + kernels/cuda/kernel.cu ) set(SRC_HEADERS @@ -35,17 +47,22 @@ set(SRC_HEADERS kernel_montecarlo.h kernel_passes.h kernel_path.h + kernel_path_branched.h + kernel_path_common.h kernel_path_state.h kernel_path_surface.h kernel_path_volume.h kernel_projection.h + kernel_queues.h kernel_random.h kernel_shader.h + kernel_shaderdata_vars.h kernel_shadow.h kernel_subsurface.h kernel_textures.h kernel_types.h kernel_volume.h + kernel_work_stealing.h ) set(SRC_CLOSURE_HEADERS @@ -67,6 +84,7 @@ set(SRC_CLOSURE_HEADERS closure/emissive.h closure/volume.h ) + set(SRC_SVM_HEADERS svm/svm.h svm/svm_attribute.h @@ -118,6 +136,7 @@ set(SRC_GEOM_HEADERS geom/geom_bvh_subsurface.h geom/geom_bvh_traversal.h geom/geom_bvh_volume.h + geom/geom_bvh_volume_all.h geom/geom_curve.h geom/geom_motion_curve.h geom/geom_motion_triangle.h @@ -128,12 +147,14 @@ set(SRC_GEOM_HEADERS geom/geom_qbvh_subsurface.h geom/geom_qbvh_traversal.h geom/geom_qbvh_volume.h + geom/geom_qbvh_volume_all.h geom/geom_triangle.h geom/geom_triangle_intersect.h geom/geom_volume.h ) set(SRC_UTIL_HEADERS + ../util/util_atomic.h 
../util/util_color.h ../util/util_half.h ../util/util_math.h @@ -141,6 +162,21 @@ set(SRC_UTIL_HEADERS ../util/util_transform.h ../util/util_types.h ) + +set(SRC_SPLIT_HEADERS + split/kernel_background_buffer_update.h + split/kernel_data_init.h + split/kernel_direct_lighting.h + split/kernel_holdout_emission_blurring_pathtermination_ao.h + split/kernel_lamp_emission.h + split/kernel_next_iteration_setup.h + split/kernel_scene_intersect.h + split/kernel_shader_eval.h + split/kernel_shadow_blocked.h + split/kernel_split_common.h + split/kernel_sum_all_radiance.h +) + # CUDA module if(WITH_CYCLES_CUDA_BINARIES) @@ -166,12 +202,12 @@ if(WITH_CYCLES_CUDA_BINARIES) endif() # build for each arch - set(cuda_sources kernel.cu ${SRC_HEADERS} ${SRC_SVM_HEADERS} ${SRC_GEOM_HEADERS} ${SRC_CLOSURE_HEADERS} ${SRC_UTIL_HEADERS}) + set(cuda_sources kernels/cuda/kernel.cu ${SRC_HEADERS} ${SRC_SVM_HEADERS} ${SRC_GEOM_HEADERS} ${SRC_CLOSURE_HEADERS} ${SRC_UTIL_HEADERS}) set(cuda_cubins) macro(CYCLES_CUDA_KERNEL_ADD arch experimental) if(${experimental}) - set(cuda_extra_flags "-D__KERNEL_CUDA_EXPERIMENTAL__") + set(cuda_extra_flags "-D__KERNEL_EXPERIMENTAL__") set(cuda_cubin kernel_experimental_${arch}.cubin) else() set(cuda_extra_flags "") @@ -192,7 +228,7 @@ if(WITH_CYCLES_CUDA_BINARIES) COMMAND ${CUDA_NVCC_EXECUTABLE} -arch=${arch} -m${CUDA_BITS} - --cubin ${CMAKE_CURRENT_SOURCE_DIR}/kernel.cu + --cubin ${CMAKE_CURRENT_SOURCE_DIR}/kernels/cuda/kernel.cu -o ${CMAKE_CURRENT_BINARY_DIR}/${cuda_cubin} --ptxas-options="-v" ${cuda_arch_flags} @@ -240,28 +276,28 @@ include_directories(SYSTEM ${INC_SYS}) if(CXX_HAS_SSE) list(APPEND SRC - kernel_sse2.cpp - kernel_sse3.cpp - kernel_sse41.cpp + kernels/cpu/kernel_sse2.cpp + kernels/cpu/kernel_sse3.cpp + kernels/cpu/kernel_sse41.cpp ) - set_source_files_properties(kernel_sse2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS}") - set_source_files_properties(kernel_sse3.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE3_KERNEL_FLAGS}") - 
set_source_files_properties(kernel_sse41.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE41_KERNEL_FLAGS}") + set_source_files_properties(kernels/cpu/kernel_sse2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS}") + set_source_files_properties(kernels/cpu/kernel_sse3.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE3_KERNEL_FLAGS}") + set_source_files_properties(kernels/cpu/kernel_sse41.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE41_KERNEL_FLAGS}") endif() if(CXX_HAS_AVX) list(APPEND SRC - kernel_avx.cpp + kernels/cpu/kernel_avx.cpp ) - set_source_files_properties(kernel_avx.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX_KERNEL_FLAGS}") + set_source_files_properties(kernels/cpu/kernel_avx.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX_KERNEL_FLAGS}") endif() if(CXX_HAS_AVX2) list(APPEND SRC - kernel_avx2.cpp + kernels/cpu/kernel_avx2.cpp ) - set_source_files_properties(kernel_avx2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX2_KERNEL_FLAGS}") + set_source_files_properties(kernels/cpu/kernel_avx2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX2_KERNEL_FLAGS}") endif() add_library(cycles_kernel ${SRC} ${SRC_HEADERS} ${SRC_CLOSURE_HEADERS} ${SRC_SVM_HEADERS} ${SRC_GEOM_HEADERS}) @@ -280,11 +316,23 @@ endif() #add_custom_target(cycles_kernel_preprocess ALL DEPENDS ${KERNEL_PREPROCESSED}) #delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${KERNEL_PREPROCESSED}" ${CYCLES_INSTALL_PATH}/kernel) -delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernel.cl" ${CYCLES_INSTALL_PATH}/kernel) -delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernel.cu" ${CYCLES_INSTALL_PATH}/kernel) +delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl) +delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_data_init.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl) +delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_queue_enqueue.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl) +delayed_install(${CMAKE_CURRENT_SOURCE_DIR} 
"kernels/opencl/kernel_scene_intersect.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl) +delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_lamp_emission.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl) +delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_background_buffer_update.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl) +delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_shader_eval.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl) +delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl) +delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_direct_lighting.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl) +delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_shadow_blocked.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl) +delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_next_iteration_setup.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl) +delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_sum_all_radiance.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl) +delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/cuda/kernel.cu" ${CYCLES_INSTALL_PATH}/kernel/kernels/cuda) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_CLOSURE_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel/closure) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_SVM_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel/svm) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_GEOM_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel/geom) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_UTIL_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel) +delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_SPLIT_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel/split) diff --git a/intern/cycles/kernel/SConscript b/intern/cycles/kernel/SConscript 
index c0d969e24ae..e8d51013924 100644 --- a/intern/cycles/kernel/SConscript +++ b/intern/cycles/kernel/SConscript @@ -57,8 +57,9 @@ if env['WITH_BF_CYCLES_CUDA_BINARIES']: build_dir = os.path.join(root_build_dir, 'intern/cycles/kernel') # source directories and files + kernel_file_rel = os.path.join("kernels", "cuda", "kernel.cu") source_dir = Dir('.').srcnode().path - kernel_file = os.path.join(source_dir, "kernel.cu") + kernel_file = os.path.join(source_dir, kernel_file_rel) util_dir = os.path.join(source_dir, "../util") svm_dir = os.path.join(source_dir, "../svm") geom_dir = os.path.join(source_dir, "../geom") @@ -83,11 +84,11 @@ if env['WITH_BF_CYCLES_CUDA_BINARIES']: nvcc_flags += " -D__KERNEL_DEBUG__" # dependencies - dependencies = ['kernel.cu'] + kernel.Glob('*.h') + kernel.Glob('../util/*.h') + kernel.Glob('svm/*.h') + kernel.Glob('geom/*.h') + kernel.Glob('closure/*.h') + dependencies = [kernel_file_rel] + kernel.Glob('*.h') + kernel.Glob('../util/*.h') + kernel.Glob('svm/*.h') + kernel.Glob('geom/*.h') + kernel.Glob('closure/*.h') last_cubin_file = None configs = (("kernel_%s.cubin", ''), - ("kernel_experimental_%s.cubin", ' -D__KERNEL_CUDA_EXPERIMENTAL__')) + ("kernel_experimental_%s.cubin", ' -D__KERNEL_EXPERIMENTAL__')) # add command for each cuda architecture for arch in cuda_archs: @@ -105,7 +106,7 @@ if env['WITH_BF_CYCLES_CUDA_BINARIES']: else: command = "\"%s\" -arch=%s %s \"%s\" -o \"%s\"" % (nvcc, arch, current_flags, kernel_file, cubin_file) - kernel.Command(cubin_file, 'kernel.cu', command) + kernel.Command(cubin_file, kernel_file_rel, command) kernel.Depends(cubin_file, dependencies) kernel_binaries.append(cubin_file) diff --git a/intern/cycles/kernel/closure/bsdf.h b/intern/cycles/kernel/closure/bsdf.h index 2b9e2a4e44d..558aa0dc6a9 100644 --- a/intern/cycles/kernel/closure/bsdf.h +++ b/intern/cycles/kernel/closure/bsdf.h @@ -47,79 +47,79 @@ ccl_device int bsdf_sample(KernelGlobals *kg, const ShaderData *sd, const Shader switch(sc->type) { 
case CLOSURE_BSDF_DIFFUSE_ID: case CLOSURE_BSDF_BSSRDF_ID: - label = bsdf_diffuse_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, + label = bsdf_diffuse_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf); break; #ifdef __SVM__ case CLOSURE_BSDF_OREN_NAYAR_ID: - label = bsdf_oren_nayar_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, + label = bsdf_oren_nayar_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf); break; /*case CLOSURE_BSDF_PHONG_RAMP_ID: - label = bsdf_phong_ramp_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, + label = bsdf_phong_ramp_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf); break; case CLOSURE_BSDF_DIFFUSE_RAMP_ID: - label = bsdf_diffuse_ramp_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, + label = bsdf_diffuse_ramp_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf); break;*/ case CLOSURE_BSDF_TRANSLUCENT_ID: - label = bsdf_translucent_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, + label = bsdf_translucent_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf); break; case CLOSURE_BSDF_REFLECTION_ID: - label = bsdf_reflection_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, + label = bsdf_reflection_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf); break; case CLOSURE_BSDF_REFRACTION_ID: - label = 
bsdf_refraction_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, + label = bsdf_refraction_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf); break; case CLOSURE_BSDF_TRANSPARENT_ID: - label = bsdf_transparent_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, + label = bsdf_transparent_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf); break; case CLOSURE_BSDF_MICROFACET_GGX_ID: case CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID: case CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID: - label = bsdf_microfacet_ggx_sample(kg, sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, + label = bsdf_microfacet_ggx_sample(kg, sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf); break; case CLOSURE_BSDF_MICROFACET_BECKMANN_ID: case CLOSURE_BSDF_MICROFACET_BECKMANN_ANISO_ID: case CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID: - label = bsdf_microfacet_beckmann_sample(kg, sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, + label = bsdf_microfacet_beckmann_sample(kg, sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf); break; case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ID: case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ANISO_ID: - label = bsdf_ashikhmin_shirley_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, + label = bsdf_ashikhmin_shirley_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf); break; case CLOSURE_BSDF_ASHIKHMIN_VELVET_ID: - label = bsdf_ashikhmin_velvet_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, + label = 
bsdf_ashikhmin_velvet_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf); break; case CLOSURE_BSDF_DIFFUSE_TOON_ID: - label = bsdf_diffuse_toon_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, + label = bsdf_diffuse_toon_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf); break; case CLOSURE_BSDF_GLOSSY_TOON_ID: - label = bsdf_glossy_toon_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, + label = bsdf_glossy_toon_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf); break; case CLOSURE_BSDF_HAIR_REFLECTION_ID: - label = bsdf_hair_reflection_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, + label = bsdf_hair_reflection_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf); break; case CLOSURE_BSDF_HAIR_TRANSMISSION_ID: - label = bsdf_hair_transmission_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, + label = bsdf_hair_transmission_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf); break; #endif #ifdef __VOLUME__ case CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID: - label = volume_henyey_greenstein_sample(sc, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf); + label = volume_henyey_greenstein_sample(sc, ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf); break; #endif default: @@ -139,67 +139,67 @@ ccl_device float3 bsdf_eval(KernelGlobals *kg, const ShaderData *sd, const 
Shade return OSLShader::bsdf_eval(sd, sc, omega_in, *pdf); #endif - if(dot(sd->Ng, omega_in) >= 0.0f) { + if(dot(ccl_fetch(sd, Ng), omega_in) >= 0.0f) { switch(sc->type) { case CLOSURE_BSDF_DIFFUSE_ID: case CLOSURE_BSDF_BSSRDF_ID: - eval = bsdf_diffuse_eval_reflect(sc, sd->I, omega_in, pdf); + eval = bsdf_diffuse_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf); break; #ifdef __SVM__ case CLOSURE_BSDF_OREN_NAYAR_ID: - eval = bsdf_oren_nayar_eval_reflect(sc, sd->I, omega_in, pdf); + eval = bsdf_oren_nayar_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf); break; /*case CLOSURE_BSDF_PHONG_RAMP_ID: - eval = bsdf_phong_ramp_eval_reflect(sc, sd->I, omega_in, pdf); + eval = bsdf_phong_ramp_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf); break; case CLOSURE_BSDF_DIFFUSE_RAMP_ID: - eval = bsdf_diffuse_ramp_eval_reflect(sc, sd->I, omega_in, pdf); + eval = bsdf_diffuse_ramp_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf); break;*/ case CLOSURE_BSDF_TRANSLUCENT_ID: - eval = bsdf_translucent_eval_reflect(sc, sd->I, omega_in, pdf); + eval = bsdf_translucent_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf); break; case CLOSURE_BSDF_REFLECTION_ID: - eval = bsdf_reflection_eval_reflect(sc, sd->I, omega_in, pdf); + eval = bsdf_reflection_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf); break; case CLOSURE_BSDF_REFRACTION_ID: - eval = bsdf_refraction_eval_reflect(sc, sd->I, omega_in, pdf); + eval = bsdf_refraction_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf); break; case CLOSURE_BSDF_TRANSPARENT_ID: - eval = bsdf_transparent_eval_reflect(sc, sd->I, omega_in, pdf); + eval = bsdf_transparent_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf); break; case CLOSURE_BSDF_MICROFACET_GGX_ID: case CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID: case CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID: - eval = bsdf_microfacet_ggx_eval_reflect(sc, sd->I, omega_in, pdf); + eval = bsdf_microfacet_ggx_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf); break; case CLOSURE_BSDF_MICROFACET_BECKMANN_ID: 
case CLOSURE_BSDF_MICROFACET_BECKMANN_ANISO_ID: case CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID: - eval = bsdf_microfacet_beckmann_eval_reflect(sc, sd->I, omega_in, pdf); + eval = bsdf_microfacet_beckmann_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf); break; case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ID: case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ANISO_ID: - eval = bsdf_ashikhmin_shirley_eval_reflect(sc, sd->I, omega_in, pdf); + eval = bsdf_ashikhmin_shirley_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf); break; case CLOSURE_BSDF_ASHIKHMIN_VELVET_ID: - eval = bsdf_ashikhmin_velvet_eval_reflect(sc, sd->I, omega_in, pdf); + eval = bsdf_ashikhmin_velvet_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf); break; case CLOSURE_BSDF_DIFFUSE_TOON_ID: - eval = bsdf_diffuse_toon_eval_reflect(sc, sd->I, omega_in, pdf); + eval = bsdf_diffuse_toon_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf); break; case CLOSURE_BSDF_GLOSSY_TOON_ID: - eval = bsdf_glossy_toon_eval_reflect(sc, sd->I, omega_in, pdf); + eval = bsdf_glossy_toon_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf); break; case CLOSURE_BSDF_HAIR_REFLECTION_ID: - eval = bsdf_hair_reflection_eval_reflect(sc, sd->I, omega_in, pdf); + eval = bsdf_hair_reflection_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf); break; case CLOSURE_BSDF_HAIR_TRANSMISSION_ID: - eval = bsdf_hair_transmission_eval_reflect(sc, sd->I, omega_in, pdf); + eval = bsdf_hair_transmission_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf); break; #endif #ifdef __VOLUME__ case CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID: - eval = volume_henyey_greenstein_eval_phase(sc, sd->I, omega_in, pdf); + eval = volume_henyey_greenstein_eval_phase(sc, ccl_fetch(sd, I), omega_in, pdf); break; #endif default: @@ -211,57 +211,57 @@ ccl_device float3 bsdf_eval(KernelGlobals *kg, const ShaderData *sd, const Shade switch(sc->type) { case CLOSURE_BSDF_DIFFUSE_ID: case CLOSURE_BSDF_BSSRDF_ID: - eval = bsdf_diffuse_eval_transmit(sc, sd->I, omega_in, pdf); + eval = 
bsdf_diffuse_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf); break; #ifdef __SVM__ case CLOSURE_BSDF_OREN_NAYAR_ID: - eval = bsdf_oren_nayar_eval_transmit(sc, sd->I, omega_in, pdf); + eval = bsdf_oren_nayar_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf); break; case CLOSURE_BSDF_TRANSLUCENT_ID: - eval = bsdf_translucent_eval_transmit(sc, sd->I, omega_in, pdf); + eval = bsdf_translucent_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf); break; case CLOSURE_BSDF_REFLECTION_ID: - eval = bsdf_reflection_eval_transmit(sc, sd->I, omega_in, pdf); + eval = bsdf_reflection_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf); break; case CLOSURE_BSDF_REFRACTION_ID: - eval = bsdf_refraction_eval_transmit(sc, sd->I, omega_in, pdf); + eval = bsdf_refraction_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf); break; case CLOSURE_BSDF_TRANSPARENT_ID: - eval = bsdf_transparent_eval_transmit(sc, sd->I, omega_in, pdf); + eval = bsdf_transparent_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf); break; case CLOSURE_BSDF_MICROFACET_GGX_ID: case CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID: case CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID: - eval = bsdf_microfacet_ggx_eval_transmit(sc, sd->I, omega_in, pdf); + eval = bsdf_microfacet_ggx_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf); break; case CLOSURE_BSDF_MICROFACET_BECKMANN_ID: case CLOSURE_BSDF_MICROFACET_BECKMANN_ANISO_ID: case CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID: - eval = bsdf_microfacet_beckmann_eval_transmit(sc, sd->I, omega_in, pdf); + eval = bsdf_microfacet_beckmann_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf); break; case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ID: case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ANISO_ID: - eval = bsdf_ashikhmin_shirley_eval_transmit(sc, sd->I, omega_in, pdf); + eval = bsdf_ashikhmin_shirley_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf); break; case CLOSURE_BSDF_ASHIKHMIN_VELVET_ID: - eval = bsdf_ashikhmin_velvet_eval_transmit(sc, sd->I, omega_in, pdf); + eval = 
bsdf_ashikhmin_velvet_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf); break; case CLOSURE_BSDF_DIFFUSE_TOON_ID: - eval = bsdf_diffuse_toon_eval_transmit(sc, sd->I, omega_in, pdf); + eval = bsdf_diffuse_toon_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf); break; case CLOSURE_BSDF_GLOSSY_TOON_ID: - eval = bsdf_glossy_toon_eval_transmit(sc, sd->I, omega_in, pdf); + eval = bsdf_glossy_toon_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf); break; case CLOSURE_BSDF_HAIR_REFLECTION_ID: - eval = bsdf_hair_reflection_eval_transmit(sc, sd->I, omega_in, pdf); + eval = bsdf_hair_reflection_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf); break; case CLOSURE_BSDF_HAIR_TRANSMISSION_ID: - eval = bsdf_hair_transmission_eval_transmit(sc, sd->I, omega_in, pdf); + eval = bsdf_hair_transmission_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf); break; #endif #ifdef __VOLUME__ case CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID: - eval = volume_henyey_greenstein_eval_phase(sc, sd->I, omega_in, pdf); + eval = volume_henyey_greenstein_eval_phase(sc, ccl_fetch(sd, I), omega_in, pdf); break; #endif default: diff --git a/intern/cycles/kernel/closure/bsdf_ashikhmin_shirley.h b/intern/cycles/kernel/closure/bsdf_ashikhmin_shirley.h index acc477246d2..8d7d533d6f8 100644 --- a/intern/cycles/kernel/closure/bsdf_ashikhmin_shirley.h +++ b/intern/cycles/kernel/closure/bsdf_ashikhmin_shirley.h @@ -69,6 +69,9 @@ ccl_device float3 bsdf_ashikhmin_shirley_eval_reflect(const ShaderClosure *sc, c float out = 0.0f; + if(fmaxf(sc->data0, sc->data1) <= 1e-4f) + return make_float3(0.0f, 0.0f, 0.0f); + if(NdotI > 0.0f && NdotO > 0.0f) { NdotI = fmaxf(NdotI, 1e-6f); NdotO = fmaxf(NdotO, 1e-6f); @@ -190,8 +193,15 @@ ccl_device int bsdf_ashikhmin_shirley_sample(const ShaderClosure *sc, float3 Ng, /* reflect I on H to get omega_in */ *omega_in = -I + (2.0f * HdotI) * H; - /* leave the rest to eval_reflect */ - *eval = bsdf_ashikhmin_shirley_eval_reflect(sc, I, *omega_in, pdf); + if(fmaxf(sc->data0, sc->data1) <= 
1e-4f) { + /* Some high number for MIS. */ + *pdf = 1e6f; + *eval = make_float3(1e6f, 1e6f, 1e6f); + } + else { + /* leave the rest to eval_reflect */ + *eval = bsdf_ashikhmin_shirley_eval_reflect(sc, I, *omega_in, pdf); + } #ifdef __RAY_DIFFERENTIALS__ /* just do the reflection thing for now */ diff --git a/intern/cycles/kernel/closure/bsdf_ashikhmin_velvet.h b/intern/cycles/kernel/closure/bsdf_ashikhmin_velvet.h index 580f50d5dd6..f1a26650078 100644 --- a/intern/cycles/kernel/closure/bsdf_ashikhmin_velvet.h +++ b/intern/cycles/kernel/closure/bsdf_ashikhmin_velvet.h @@ -59,7 +59,7 @@ ccl_device float3 bsdf_ashikhmin_velvet_eval_reflect(const ShaderClosure *sc, co float cosHO = fabsf(dot(I, H)); if(!(fabsf(cosNH) < 1.0f-1e-5f && cosHO > 1e-5f)) - return make_float3(0, 0, 0); + return make_float3(0.0f, 0.0f, 0.0f); float cosNHdivHO = cosNH / cosHO; cosNHdivHO = fmaxf(cosNHdivHO, 1e-5f); @@ -80,7 +80,7 @@ ccl_device float3 bsdf_ashikhmin_velvet_eval_reflect(const ShaderClosure *sc, co return make_float3(out, out, out); } - return make_float3(0, 0, 0); + return make_float3(0.0f, 0.0f, 0.0f); } ccl_device float3 bsdf_ashikhmin_velvet_eval_transmit(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf) @@ -114,7 +114,7 @@ ccl_device int bsdf_ashikhmin_velvet_sample(const ShaderClosure *sc, float3 Ng, float sinNH2 = 1 - cosNH * cosNH; float sinNH4 = sinNH2 * sinNH2; - float cotangent2 = (cosNH * cosNH) / sinNH2; + float cotangent2 = (cosNH * cosNH) / sinNH2; float D = expf(-cotangent2 * m_invsigma2) * m_invsigma2 * M_1_PI_F / sinNH4; float G = min(1.0f, min(fac1, fac2)); // TODO: derive G from D analytically diff --git a/intern/cycles/kernel/closure/bsdf_diffuse_ramp.h b/intern/cycles/kernel/closure/bsdf_diffuse_ramp.h index cdaf84f1750..e0287e7655a 100644 --- a/intern/cycles/kernel/closure/bsdf_diffuse_ramp.h +++ b/intern/cycles/kernel/closure/bsdf_diffuse_ramp.h @@ -52,6 +52,8 @@ ccl_device float3 bsdf_diffuse_ramp_get_color(const ShaderClosure 
*sc, const flo ccl_device int bsdf_diffuse_ramp_setup(ShaderClosure *sc) { sc->type = CLOSURE_BSDF_DIFFUSE_RAMP_ID; + sc->data0 = 0.0f; + sc->data1 = 0.0f; return SD_BSDF|SD_BSDF_HAS_EVAL; } diff --git a/intern/cycles/kernel/closure/bsdf_microfacet.h b/intern/cycles/kernel/closure/bsdf_microfacet.h index 71086f2e764..6a50bbed3b3 100644 --- a/intern/cycles/kernel/closure/bsdf_microfacet.h +++ b/intern/cycles/kernel/closure/bsdf_microfacet.h @@ -35,79 +35,7 @@ CCL_NAMESPACE_BEGIN -/* Approximate erf and erfinv implementations. - * Implementation comes straight from Wikipedia: - * - * http://en.wikipedia.org/wiki/Error_function - * - * Some constants are baked into the code. - */ - -ccl_device_inline float approx_erff_do(float x) -{ - /* Such a clamp doesn't give much distortion to the output value - * and gives quite a few of the speedup. - */ - if(x > 3.0f) { - return 1.0f; - } - float t = 1.0f / (1.0f + 0.47047f*x); - return (1.0f - - t*(0.3480242f + t*(-0.0958798f + t*0.7478556f)) * expf(-x*x)); -} - -ccl_device_inline float approx_erff(float x) -{ - if(x >= 0.0f) { - return approx_erff_do(x); - } - else { - return -approx_erff_do(-x); - } -} - -ccl_device_inline float approx_erfinvf_do(float x) -{ - if(x <= 0.7f) { - const float x2 = x * x; - const float a1 = 0.886226899f; - const float a2 = -1.645349621f; - const float a3 = 0.914624893f; - const float a4 = -0.140543331f; - const float b1 = -2.118377725f; - const float b2 = 1.442710462f; - const float b3 = -0.329097515f; - const float b4 = 0.012229801f; - return x * (((a4 * x2 + a3) * x2 + a2) * x2 + a1) / - ((((b4 * x2 + b3) * x2 + b2) * x2 + b1) * x2 + 1.0f); - } - else { - const float c1 = -1.970840454f; - const float c2 = -1.624906493f; - const float c3 = 3.429567803f; - const float c4 = 1.641345311f; - const float d1 = 3.543889200f; - const float d2 = 1.637067800f; - const float z = sqrtf(-logf((1.0f - x) * 0.5f)); - return (((c4 * z + c3) * z + c2) * z + c1) / - ((d2 * z + d1) * z + 1.0f); - } -} - 
-ccl_device_inline float approx_erfinvf(float x) -{ - if(x >= 0.0f) { - return approx_erfinvf_do(x); - } - else { - return -approx_erfinvf_do(-x); - } -} - -/* Beckmann and GGX microfacet importance sampling from: - * - * Importance Sampling Microfacet-Based BSDFs using the Distribution of Visible Normals. - * E. Heitz and E. d'Eon, EGSR 2014 */ +/* Beckmann and GGX microfacet importance sampling. */ ccl_device_inline void microfacet_beckmann_sample_slopes( KernelGlobals *kg, @@ -128,64 +56,71 @@ ccl_device_inline void microfacet_beckmann_sample_slopes( /* precomputations */ const float tan_theta_i = sin_theta_i/cos_theta_i; const float inv_a = tan_theta_i; - const float a = 1.0f/inv_a; - const float erf_a = approx_erff(a); - const float exp_a2 = expf(-a*a); + const float cot_theta_i = 1.0f/tan_theta_i; + const float erf_a = fast_erff(cot_theta_i); + const float exp_a2 = expf(-cot_theta_i*cot_theta_i); const float SQRT_PI_INV = 0.56418958354f; const float Lambda = 0.5f*(erf_a - 1.0f) + (0.5f*SQRT_PI_INV)*(exp_a2*inv_a); const float G1 = 1.0f/(1.0f + Lambda); /* masking */ *G1i = G1; -#if 0 - const float C = 1.0f - G1 * erf_a; - - /* sample slope X */ - if(randu < C) { - /* rescale randu */ - randu = randu / C; - const float w_1 = 0.5f * SQRT_PI_INV * sin_theta_i * exp_a2; - const float w_2 = cos_theta_i * (0.5f - 0.5f*erf_a); - const float p = w_1 / (w_1 + w_2); +#if defined(__KERNEL_GPU__) + /* Based on paper from Wenzel Jakob + * An Improved Visible Normal Sampling Routine for the Beckmann Distribution + * + * http://www.mitsuba-renderer.org/~wenzel/files/visnormal.pdf + * + * Reformulation from OpenShadingLanguage which avoids using inverse + * trigonometric functions. + */ - if(randu < p) { - randu = randu / p; - *slope_x = -sqrtf(-logf(randu*exp_a2)); - } - else { - randu = (randu - p) / (1.0f - p); - *slope_x = approx_erfinvf(randu - 1.0f - randu*erf_a); - } + /* Sample slope X. 
+ * + * Compute a coarse approximation using the approximation: + * exp(-ierf(x)^2) ~= 1 - x * x + * solve y = 1 + b + K * (1 - b * b) + */ + float K = tan_theta_i * SQRT_PI_INV; + float y_approx = randu * (1.0f + erf_a + K * (1 - erf_a * erf_a)); + float y_exact = randu * (1.0f + erf_a + K * exp_a2); + float b = K > 0 ? (0.5f - sqrtf(K * (K - y_approx + 1.0f) + 0.25f)) / K : y_approx - 1.0f; + + /* Perform newton step to refine toward the true root. */ + float inv_erf = fast_ierff(b); + float value = 1.0f + b + K * expf(-inv_erf * inv_erf) - y_exact; + /* Check if we are close enough already, + * this also avoids NaNs as we get close to the root. + */ + if(fabsf(value) > 1e-6f) { + b -= value / (1.0f - inv_erf * tan_theta_i); /* newton step 1. */ + inv_erf = fast_ierff(b); + value = 1.0f + b + K * expf(-inv_erf * inv_erf) - y_exact; + b -= value / (1.0f - inv_erf * tan_theta_i); /* newton step 2. */ + /* Compute the slope from the refined value. */ + *slope_x = fast_ierff(b); } else { - /* rescale randu */ - randu = (randu - C) / (1.0f - C); - *slope_x = approx_erfinvf((-1.0f + 2.0f*randu)*erf_a); - - const float p = (-(*slope_x)*sin_theta_i + cos_theta_i) / (2.0f*cos_theta_i); - - if(randv > p) { - *slope_x = -(*slope_x); - randv = (randv - p) / (1.0f - p); - } - else - randv = randv / p; + /* We are close enough already. */ + *slope_x = inv_erf; } - - /* sample slope Y */ - *slope_y = approx_erfinvf(2.0f*randv - 1.0f); + *slope_y = fast_ierff(2.0f*randv - 1.0f); #else - /* use precomputed table, because it better preserves stratification - * of the random number pattern */ + /* Use precomputed table on CPU, it gives better performance. 
*/ int beckmann_table_offset = kernel_data.tables.beckmann_offset; *slope_x = lookup_table_read_2D(kg, randu, cos_theta_i, beckmann_table_offset, BECKMANN_TABLE_SIZE, BECKMANN_TABLE_SIZE); - *slope_y = approx_erfinvf(2.0f*randv - 1.0f); + *slope_y = fast_ierff(2.0f*randv - 1.0f); #endif - } +/* GGX microfacet importance sampling from: + * + * Importance Sampling Microfacet-Based BSDFs using the Distribution of Visible Normals. + * E. Heitz and E. d'Eon, EGSR 2014 + */ + ccl_device_inline void microfacet_ggx_sample_slopes( const float cos_theta_i, const float sin_theta_i, float randu, float randv, float *slope_x, float *slope_y, @@ -300,7 +235,7 @@ ccl_device_inline float3 microfacet_sample_stretched( ccl_device int bsdf_microfacet_ggx_setup(ShaderClosure *sc) { - sc->data0 = clamp(sc->data0, 0.0f, 1.0f); /* alpha_x */ + sc->data0 = saturate(sc->data0); /* alpha_x */ sc->data1 = sc->data0; /* alpha_y */ sc->type = CLOSURE_BSDF_MICROFACET_GGX_ID; @@ -310,8 +245,8 @@ ccl_device int bsdf_microfacet_ggx_setup(ShaderClosure *sc) ccl_device int bsdf_microfacet_ggx_aniso_setup(ShaderClosure *sc) { - sc->data0 = clamp(sc->data0, 0.0f, 1.0f); /* alpha_x */ - sc->data1 = clamp(sc->data1, 0.0f, 1.0f); /* alpha_y */ + sc->data0 = saturate(sc->data0); /* alpha_x */ + sc->data1 = saturate(sc->data1); /* alpha_y */ sc->type = CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID; @@ -320,7 +255,7 @@ ccl_device int bsdf_microfacet_ggx_aniso_setup(ShaderClosure *sc) ccl_device int bsdf_microfacet_ggx_refraction_setup(ShaderClosure *sc) { - sc->data0 = clamp(sc->data0, 0.0f, 1.0f); /* alpha_x */ + sc->data0 = saturate(sc->data0); /* alpha_x */ sc->data1 = sc->data0; /* alpha_y */ sc->type = CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID; @@ -342,7 +277,7 @@ ccl_device float3 bsdf_microfacet_ggx_eval_reflect(const ShaderClosure *sc, cons float3 N = sc->N; if(m_refractive || fmaxf(alpha_x, alpha_y) <= 1e-4f) - return make_float3(0, 0, 0); + return make_float3(0.0f, 0.0f, 0.0f); float cosNO = dot(N, I); 
float cosNI = dot(N, omega_in); @@ -421,7 +356,7 @@ ccl_device float3 bsdf_microfacet_ggx_eval_reflect(const ShaderClosure *sc, cons return make_float3(out, out, out); } - return make_float3(0, 0, 0); + return make_float3(0.0f, 0.0f, 0.0f); } ccl_device float3 bsdf_microfacet_ggx_eval_transmit(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf) @@ -433,13 +368,13 @@ ccl_device float3 bsdf_microfacet_ggx_eval_transmit(const ShaderClosure *sc, con float3 N = sc->N; if(!m_refractive || fmaxf(alpha_x, alpha_y) <= 1e-4f) - return make_float3(0, 0, 0); + return make_float3(0.0f, 0.0f, 0.0f); float cosNO = dot(N, I); float cosNI = dot(N, omega_in); if(cosNO <= 0 || cosNI >= 0) - return make_float3(0, 0, 0); /* vectors on same side -- not possible */ + return make_float3(0.0f, 0.0f, 0.0f); /* vectors on same side -- not possible */ /* compute half-vector of the refraction (eq. 16) */ float3 ht = -(m_eta * omega_in + I); @@ -653,7 +588,7 @@ ccl_device int bsdf_microfacet_ggx_sample(KernelGlobals *kg, const ShaderClosure ccl_device int bsdf_microfacet_beckmann_setup(ShaderClosure *sc) { - sc->data0 = clamp(sc->data0, 0.0f, 1.0f); /* alpha_x */ + sc->data0 = saturate(sc->data0); /* alpha_x */ sc->data1 = sc->data0; /* alpha_y */ sc->type = CLOSURE_BSDF_MICROFACET_BECKMANN_ID; @@ -662,8 +597,8 @@ ccl_device int bsdf_microfacet_beckmann_setup(ShaderClosure *sc) ccl_device int bsdf_microfacet_beckmann_aniso_setup(ShaderClosure *sc) { - sc->data0 = clamp(sc->data0, 0.0f, 1.0f); /* alpha_x */ - sc->data1 = clamp(sc->data1, 0.0f, 1.0f); /* alpha_y */ + sc->data0 = saturate(sc->data0); /* alpha_x */ + sc->data1 = saturate(sc->data1); /* alpha_y */ sc->type = CLOSURE_BSDF_MICROFACET_BECKMANN_ANISO_ID; return SD_BSDF|SD_BSDF_HAS_EVAL; @@ -671,7 +606,7 @@ ccl_device int bsdf_microfacet_beckmann_aniso_setup(ShaderClosure *sc) ccl_device int bsdf_microfacet_beckmann_refraction_setup(ShaderClosure *sc) { - sc->data0 = clamp(sc->data0, 0.0f, 1.0f); /* alpha_x */ + 
sc->data0 = saturate(sc->data0); /* alpha_x */ sc->data1 = sc->data0; /* alpha_y */ sc->type = CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID; @@ -692,7 +627,7 @@ ccl_device float3 bsdf_microfacet_beckmann_eval_reflect(const ShaderClosure *sc, float3 N = sc->N; if(m_refractive || fmaxf(alpha_x, alpha_y) <= 1e-4f) - return make_float3(0, 0, 0); + return make_float3(0.0f, 0.0f, 0.0f); float cosNO = dot(N, I); float cosNI = dot(N, omega_in); @@ -774,7 +709,7 @@ ccl_device float3 bsdf_microfacet_beckmann_eval_reflect(const ShaderClosure *sc, return make_float3(out, out, out); } - return make_float3(0, 0, 0); + return make_float3(0.0f, 0.0f, 0.0f); } ccl_device float3 bsdf_microfacet_beckmann_eval_transmit(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf) @@ -786,13 +721,13 @@ ccl_device float3 bsdf_microfacet_beckmann_eval_transmit(const ShaderClosure *sc float3 N = sc->N; if(!m_refractive || fmaxf(alpha_x, alpha_y) <= 1e-4f) - return make_float3(0, 0, 0); + return make_float3(0.0f, 0.0f, 0.0f); float cosNO = dot(N, I); float cosNI = dot(N, omega_in); if(cosNO <= 0 || cosNI >= 0) - return make_float3(0, 0, 0); + return make_float3(0.0f, 0.0f, 0.0f); /* compute half-vector of the refraction (eq. 
16) */ float3 ht = -(m_eta * omega_in + I); diff --git a/intern/cycles/kernel/closure/bsdf_oren_nayar.h b/intern/cycles/kernel/closure/bsdf_oren_nayar.h index c476d4ca4e2..61b7cb11b02 100644 --- a/intern/cycles/kernel/closure/bsdf_oren_nayar.h +++ b/intern/cycles/kernel/closure/bsdf_oren_nayar.h @@ -37,7 +37,7 @@ ccl_device int bsdf_oren_nayar_setup(ShaderClosure *sc) sc->type = CLOSURE_BSDF_OREN_NAYAR_ID; - sigma = clamp(sigma, 0.0f, 1.0f); + sigma = saturate(sigma); float div = 1.0f / (M_PI_F + ((3.0f * M_PI_F - 4.0f) / 6.0f) * sigma); diff --git a/intern/cycles/kernel/closure/bsdf_phong_ramp.h b/intern/cycles/kernel/closure/bsdf_phong_ramp.h index f9f263719e9..1ab15eee954 100644 --- a/intern/cycles/kernel/closure/bsdf_phong_ramp.h +++ b/intern/cycles/kernel/closure/bsdf_phong_ramp.h @@ -51,9 +51,9 @@ ccl_device float3 bsdf_phong_ramp_get_color(const ShaderClosure *sc, const float ccl_device int bsdf_phong_ramp_setup(ShaderClosure *sc) { - sc->data0 = max(sc->data0, 0.0f); - sc->type = CLOSURE_BSDF_PHONG_RAMP_ID; + sc->data0 = max(sc->data0, 0.0f); + sc->data1 = 0.0f; return SD_BSDF|SD_BSDF_HAS_EVAL; } diff --git a/intern/cycles/kernel/closure/bsdf_toon.h b/intern/cycles/kernel/closure/bsdf_toon.h index df03942638f..e5b6ab93a64 100644 --- a/intern/cycles/kernel/closure/bsdf_toon.h +++ b/intern/cycles/kernel/closure/bsdf_toon.h @@ -40,8 +40,8 @@ CCL_NAMESPACE_BEGIN ccl_device int bsdf_diffuse_toon_setup(ShaderClosure *sc) { sc->type = CLOSURE_BSDF_DIFFUSE_TOON_ID; - sc->data0 = clamp(sc->data0, 0.0f, 1.0f); - sc->data1 = clamp(sc->data1, 0.0f, 1.0f); + sc->data0 = saturate(sc->data0); + sc->data1 = saturate(sc->data1); return SD_BSDF|SD_BSDF_HAS_EVAL; } @@ -120,8 +120,8 @@ ccl_device int bsdf_diffuse_toon_sample(const ShaderClosure *sc, float3 Ng, floa ccl_device int bsdf_glossy_toon_setup(ShaderClosure *sc) { sc->type = CLOSURE_BSDF_GLOSSY_TOON_ID; - sc->data0 = clamp(sc->data0, 0.0f, 1.0f); - sc->data1 = clamp(sc->data1, 0.0f, 1.0f); + sc->data0 = 
saturate(sc->data0); + sc->data1 = saturate(sc->data1); return SD_BSDF|SD_BSDF_HAS_EVAL; } diff --git a/intern/cycles/kernel/closure/bssrdf.h b/intern/cycles/kernel/closure/bssrdf.h index b6de2da8c71..f817dcd5f2d 100644 --- a/intern/cycles/kernel/closure/bssrdf.h +++ b/intern/cycles/kernel/closure/bssrdf.h @@ -30,8 +30,8 @@ ccl_device int bssrdf_setup(ShaderClosure *sc, ClosureType type) return flag; } else { - sc->data1 = clamp(sc->data1, 0.0f, 1.0f); /* texture blur */ - sc->T.x = clamp(sc->T.x, 0.0f, 1.0f); /* sharpness */ + sc->data1 = saturate(sc->data1); /* texture blur */ + sc->T.x = saturate(sc->T.x); /* sharpness */ sc->type = type; return SD_BSDF|SD_BSDF_HAS_EVAL|SD_BSSRDF; @@ -157,7 +157,7 @@ ccl_device float bssrdf_cubic_quintic_root_find(float xi) float x = 0.25f; int i; - for (i = 0; i < max_iteration_count; i++) { + for(i = 0; i < max_iteration_count; i++) { float x2 = x*x; float x3 = x2*x; float nx = (1.0f - x); @@ -168,7 +168,7 @@ ccl_device float bssrdf_cubic_quintic_root_find(float xi) if(fabsf(f) < tolerance || f_ == 0.0f) break; - x = clamp(x - f/f_, 0.0f, 1.0f); + x = saturate(x - f/f_); } return x; diff --git a/intern/cycles/kernel/closure/volume.h b/intern/cycles/kernel/closure/volume.h index 439610546e5..4d71ba50ec3 100644 --- a/intern/cycles/kernel/closure/volume.h +++ b/intern/cycles/kernel/closure/volume.h @@ -107,18 +107,9 @@ ccl_device int volume_absorption_setup(ShaderClosure *sc) ccl_device float3 volume_phase_eval(const ShaderData *sd, const ShaderClosure *sc, float3 omega_in, float *pdf) { - float3 eval; + kernel_assert(sc->type == CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID); - switch(sc->type) { - case CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID: - eval = volume_henyey_greenstein_eval_phase(sc, sd->I, omega_in, pdf); - break; - default: - eval = make_float3(0.0f, 0.0f, 0.0f); - break; - } - - return eval; + return volume_henyey_greenstein_eval_phase(sc, sd->I, omega_in, pdf); } ccl_device int volume_phase_sample(const ShaderData *sd, const 
ShaderClosure *sc, float randu, diff --git a/intern/cycles/kernel/geom/geom.h b/intern/cycles/kernel/geom/geom.h index bf0d86e6206..5ab900d47aa 100644 --- a/intern/cycles/kernel/geom/geom.h +++ b/intern/cycles/kernel/geom/geom.h @@ -22,7 +22,9 @@ #define BVH_STACK_SIZE 192 #define BVH_QSTACK_SIZE 384 #define BVH_NODE_SIZE 4 +#define BVH_NODE_LEAF_SIZE 1 #define BVH_QNODE_SIZE 7 +#define BVH_QNODE_LEAF_SIZE 1 #define TRI_NODE_SIZE 3 /* silly workaround for float extended precision that happens when compiling diff --git a/intern/cycles/kernel/geom/geom_attribute.h b/intern/cycles/kernel/geom/geom_attribute.h index 9ac16e86085..c7364e9edac 100644 --- a/intern/cycles/kernel/geom/geom_attribute.h +++ b/intern/cycles/kernel/geom/geom_attribute.h @@ -29,13 +29,13 @@ CCL_NAMESPACE_BEGIN ccl_device_inline int find_attribute(KernelGlobals *kg, const ShaderData *sd, uint id, AttributeElement *elem) { - if(sd->object == PRIM_NONE) + if(ccl_fetch(sd, object) == PRIM_NONE) return (int)ATTR_STD_NOT_FOUND; /* for SVM, find attribute by unique id */ - uint attr_offset = sd->object*kernel_data.bvh.attributes_map_stride; + uint attr_offset = ccl_fetch(sd, object)*kernel_data.bvh.attributes_map_stride; #ifdef __HAIR__ - attr_offset = (sd->type & PRIMITIVE_ALL_CURVE)? attr_offset + ATTR_PRIM_CURVE: attr_offset; + attr_offset = (ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE)? 
attr_offset + ATTR_PRIM_CURVE: attr_offset; #endif uint4 attr_map = kernel_tex_fetch(__attributes_map, attr_offset); @@ -49,7 +49,7 @@ ccl_device_inline int find_attribute(KernelGlobals *kg, const ShaderData *sd, ui *elem = (AttributeElement)attr_map.y; - if(sd->prim == PRIM_NONE && (AttributeElement)attr_map.y != ATTR_ELEMENT_MESH) + if(ccl_fetch(sd, prim) == PRIM_NONE && (AttributeElement)attr_map.y != ATTR_ELEMENT_MESH) return ATTR_STD_NOT_FOUND; /* return result */ diff --git a/intern/cycles/kernel/geom/geom_bvh.h b/intern/cycles/kernel/geom/geom_bvh.h index c0eefcd9c7f..3d0d406dd0b 100644 --- a/intern/cycles/kernel/geom/geom_bvh.h +++ b/intern/cycles/kernel/geom/geom_bvh.h @@ -115,7 +115,39 @@ CCL_NAMESPACE_BEGIN #include "geom_bvh_subsurface.h" #endif -/* Record all BVH intersection for shadows */ +/* Volume BVH traversal */ + +#if defined(__VOLUME__) +#define BVH_FUNCTION_NAME bvh_intersect_volume +#define BVH_FUNCTION_FEATURES 0 +#include "geom_bvh_volume.h" +#endif + +#if defined(__VOLUME__) && defined(__INSTANCING__) +#define BVH_FUNCTION_NAME bvh_intersect_volume_instancing +#define BVH_FUNCTION_FEATURES BVH_INSTANCING +#include "geom_bvh_volume.h" +#endif + +#if defined(__VOLUME__) && defined(__HAIR__) +#define BVH_FUNCTION_NAME bvh_intersect_volume_hair +#define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR|BVH_HAIR_MINIMUM_WIDTH +#include "geom_bvh_volume.h" +#endif + +#if defined(__VOLUME__) && defined(__OBJECT_MOTION__) +#define BVH_FUNCTION_NAME bvh_intersect_volume_motion +#define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_MOTION +#include "geom_bvh_volume.h" +#endif + +#if defined(__VOLUME__) && defined(__HAIR__) && defined(__OBJECT_MOTION__) +#define BVH_FUNCTION_NAME bvh_intersect_volume_hair_motion +#define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR|BVH_HAIR_MINIMUM_WIDTH|BVH_MOTION +#include "geom_bvh_volume.h" +#endif + +/* Record all intersections - Shadow BVH traversal */ #if defined(__SHADOW_RECORD_ALL__) #define BVH_FUNCTION_NAME 
bvh_intersect_shadow_all @@ -147,36 +179,36 @@ CCL_NAMESPACE_BEGIN #include "geom_bvh_shadow.h" #endif -/* Camera inside Volume BVH intersection */ +/* Record all intersections - Volume BVH traversal */ -#if defined(__VOLUME__) -#define BVH_FUNCTION_NAME bvh_intersect_volume +#if defined(__VOLUME_RECORD_ALL__) +#define BVH_FUNCTION_NAME bvh_intersect_volume_all #define BVH_FUNCTION_FEATURES 0 -#include "geom_bvh_volume.h" +#include "geom_bvh_volume_all.h" #endif -#if defined(__VOLUME__) && defined(__INSTANCING__) -#define BVH_FUNCTION_NAME bvh_intersect_volume_instancing +#if defined(__VOLUME_RECORD_ALL__) && defined(__INSTANCING__) +#define BVH_FUNCTION_NAME bvh_intersect_volume_all_instancing #define BVH_FUNCTION_FEATURES BVH_INSTANCING -#include "geom_bvh_volume.h" +#include "geom_bvh_volume_all.h" #endif -#if defined(__VOLUME__) && defined(__HAIR__) -#define BVH_FUNCTION_NAME bvh_intersect_volume_hair +#if defined(__VOLUME_RECORD_ALL__) && defined(__HAIR__) +#define BVH_FUNCTION_NAME bvh_intersect_volume_all_hair #define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR|BVH_HAIR_MINIMUM_WIDTH -#include "geom_bvh_volume.h" +#include "geom_bvh_volume_all.h" #endif -#if defined(__VOLUME__) && defined(__OBJECT_MOTION__) -#define BVH_FUNCTION_NAME bvh_intersect_volume_motion +#if defined(__VOLUME_RECORD_ALL__) && defined(__OBJECT_MOTION__) +#define BVH_FUNCTION_NAME bvh_intersect_volume_all_motion #define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_MOTION -#include "geom_bvh_volume.h" +#include "geom_bvh_volume_all.h" #endif -#if defined(__VOLUME__) && defined(__HAIR__) && defined(__OBJECT_MOTION__) -#define BVH_FUNCTION_NAME bvh_intersect_volume_hair_motion +#if defined(__VOLUME_RECORD_ALL__) && defined(__HAIR__) && defined(__OBJECT_MOTION__) +#define BVH_FUNCTION_NAME bvh_intersect_volume_all_hair_motion #define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR|BVH_HAIR_MINIMUM_WIDTH|BVH_MOTION -#include "geom_bvh_volume.h" +#include "geom_bvh_volume_all.h" #endif #undef 
BVH_FEATURE @@ -330,6 +362,37 @@ ccl_device_intersect bool scene_intersect_volume(KernelGlobals *kg, } #endif +#ifdef __VOLUME_RECORD_ALL__ +ccl_device_intersect uint scene_intersect_volume_all(KernelGlobals *kg, + const Ray *ray, + Intersection *isect, + const uint max_hits) +{ +#ifdef __OBJECT_MOTION__ + if(kernel_data.bvh.have_motion) { +#ifdef __HAIR__ + if(kernel_data.bvh.have_curves) + return bvh_intersect_volume_all_hair_motion(kg, ray, isect, max_hits); +#endif /* __HAIR__ */ + + return bvh_intersect_volume_all_motion(kg, ray, isect, max_hits); + } +#endif /* __OBJECT_MOTION__ */ + +#ifdef __HAIR__ + if(kernel_data.bvh.have_curves) + return bvh_intersect_volume_all_hair(kg, ray, isect, max_hits); +#endif /* __HAIR__ */ + +#ifdef __INSTANCING__ + if(kernel_data.bvh.have_instancing) + return bvh_intersect_volume_all_instancing(kg, ray, isect, max_hits); +#endif /* __INSTANCING__ */ + + return bvh_intersect_volume_all(kg, ray, isect, max_hits); +} +#endif + /* Ray offset to avoid self intersection. * @@ -384,5 +447,21 @@ ccl_device_inline float3 ray_offset(float3 P, float3 Ng) #endif } +#if defined(__SHADOW_RECORD_ALL__) || defined (__VOLUME_RECORD_ALL__) +/* ToDo: Move to another file? 
*/ +ccl_device int intersections_compare(const void *a, const void *b) +{ + const Intersection *isect_a = (const Intersection*)a; + const Intersection *isect_b = (const Intersection*)b; + + if(isect_a->t < isect_b->t) + return -1; + else if(isect_a->t > isect_b->t) + return 1; + else + return 0; +} +#endif + CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/geom/geom_bvh_shadow.h b/intern/cycles/kernel/geom/geom_bvh_shadow.h index 193f49074a3..e4cba99dc96 100644 --- a/intern/cycles/kernel/geom/geom_bvh_shadow.h +++ b/intern/cycles/kernel/geom/geom_bvh_shadow.h @@ -200,7 +200,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, /* if node is leaf, fetch triangle list */ if(nodeAddr < 0) { - float4 leaf = kernel_tex_fetch(__bvh_nodes, (-nodeAddr-1)*BVH_NODE_SIZE+3); + float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-nodeAddr-1)*BVH_NODE_LEAF_SIZE); int primAddr = __float_as_int(leaf.x); #if BVH_FEATURE(BVH_INSTANCING) @@ -226,7 +226,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, switch(p_type) { case PRIMITIVE_TRIANGLE: { - hit = triangle_intersect(kg, &isect_precalc, isect_array, P, dir, PATH_RAY_SHADOW, object, primAddr); + hit = triangle_intersect(kg, &isect_precalc, isect_array, P, PATH_RAY_SHADOW, object, primAddr); break; } #if BVH_FEATURE(BVH_MOTION) @@ -264,7 +264,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, if(kernel_tex_fetch(__prim_type, isect_array->prim) & PRIMITIVE_ALL_TRIANGLE) #endif { - shader = kernel_tex_fetch(__tri_shader, prim); + shader = kernel_tex_fetch(__tri_shader, prim); } #ifdef __HAIR__ else { diff --git a/intern/cycles/kernel/geom/geom_bvh_subsurface.h b/intern/cycles/kernel/geom/geom_bvh_subsurface.h index 290297ef5c5..a73139f9c88 100644 --- a/intern/cycles/kernel/geom/geom_bvh_subsurface.h +++ b/intern/cycles/kernel/geom/geom_bvh_subsurface.h @@ -187,7 +187,7 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, /* if node is leaf, fetch triangle list */ 
if(nodeAddr < 0) { - float4 leaf = kernel_tex_fetch(__bvh_nodes, (-nodeAddr-1)*BVH_NODE_SIZE+3); + float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-nodeAddr-1)*BVH_NODE_LEAF_SIZE); int primAddr = __float_as_int(leaf.x); #if BVH_FEATURE(BVH_INSTANCING) @@ -210,7 +210,7 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, uint tri_object = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, primAddr): object; if(tri_object != subsurface_object) continue; - triangle_intersect_subsurface(kg, &isect_precalc, isect_array, P, dir, object, primAddr, isect_t, &num_hits, lcg_state, max_hits); + triangle_intersect_subsurface(kg, &isect_precalc, isect_array, P, object, primAddr, isect_t, &num_hits, lcg_state, max_hits); } break; } diff --git a/intern/cycles/kernel/geom/geom_bvh_traversal.h b/intern/cycles/kernel/geom/geom_bvh_traversal.h index 0298e687de2..73d79fd78ee 100644 --- a/intern/cycles/kernel/geom/geom_bvh_traversal.h +++ b/intern/cycles/kernel/geom/geom_bvh_traversal.h @@ -76,6 +76,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, #if defined(__KERNEL_DEBUG__) isect->num_traversal_steps = 0; + isect->num_traversed_instances = 0; #endif #if defined(__KERNEL_SSE2__) @@ -248,7 +249,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, /* if node is leaf, fetch triangle list */ if(nodeAddr < 0) { - float4 leaf = kernel_tex_fetch(__bvh_nodes, (-nodeAddr-1)*BVH_NODE_SIZE+3); + float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-nodeAddr-1)*BVH_NODE_LEAF_SIZE); int primAddr = __float_as_int(leaf.x); #if BVH_FEATURE(BVH_INSTANCING) @@ -269,7 +270,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, isect->num_traversal_steps++; #endif kernel_assert(kernel_tex_fetch(__prim_type, primAddr) == type); - if(triangle_intersect(kg, &isect_precalc, isect, P, dir, visibility, object, primAddr)) { + if(triangle_intersect(kg, &isect_precalc, isect, P, visibility, object, primAddr)) { /* shadow ray early termination */ 
#if defined(__KERNEL_SSE2__) if(visibility == PATH_RAY_SHADOW_OPAQUE) @@ -362,6 +363,10 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, traversalStack[stackPtr] = ENTRYPOINT_SENTINEL; nodeAddr = kernel_tex_fetch(__object_node, object); + +#if defined(__KERNEL_DEBUG__) + isect->num_traversed_instances++; +#endif } } #endif /* FEATURE(BVH_INSTANCING) */ diff --git a/intern/cycles/kernel/geom/geom_bvh_volume.h b/intern/cycles/kernel/geom/geom_bvh_volume.h index 0862812a170..41c784869f2 100644 --- a/intern/cycles/kernel/geom/geom_bvh_volume.h +++ b/intern/cycles/kernel/geom/geom_bvh_volume.h @@ -188,7 +188,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, /* if node is leaf, fetch triangle list */ if(nodeAddr < 0) { - float4 leaf = kernel_tex_fetch(__bvh_nodes, (-nodeAddr-1)*BVH_NODE_SIZE+3); + float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-nodeAddr-1)*BVH_NODE_LEAF_SIZE); int primAddr = __float_as_int(leaf.x); #if BVH_FEATURE(BVH_INSTANCING) @@ -213,7 +213,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, if((object_flag & SD_OBJECT_HAS_VOLUME) == 0) { continue; } - triangle_intersect(kg, &isect_precalc, isect, P, dir, visibility, object, primAddr); + triangle_intersect(kg, &isect_precalc, isect, P, visibility, object, primAddr); } break; } diff --git a/intern/cycles/kernel/geom/geom_bvh_volume_all.h b/intern/cycles/kernel/geom/geom_bvh_volume_all.h new file mode 100644 index 00000000000..b6db36f4b17 --- /dev/null +++ b/intern/cycles/kernel/geom/geom_bvh_volume_all.h @@ -0,0 +1,454 @@ +/* + * Adapted from code Copyright 2009-2010 NVIDIA Corporation, + * and code copyright 2009-2012 Intel Corporation + * + * Modifications Copyright 2011-2014, Blender Foundation. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifdef __QBVH__ +#include "geom_qbvh_volume_all.h" +#endif + +/* This is a template BVH traversal function for volumes, where + * various features can be enabled/disabled. This way we can compile optimized + * versions for each case without new features slowing things down. + * + * BVH_INSTANCING: object instancing + * BVH_HAIR: hair curve rendering + * BVH_MOTION: motion blur rendering + * + */ + +ccl_device uint BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, + const Ray *ray, + Intersection *isect_array, + const uint max_hits) +{ + /* todo: + * - test if pushing distance on the stack helps (for non shadow rays) + * - separate version for shadow rays + * - likely and unlikely for if() statements + * - test restrict attribute for pointers + */ + + /* traversal stack in CUDA thread-local memory */ + int traversalStack[BVH_STACK_SIZE]; + traversalStack[0] = ENTRYPOINT_SENTINEL; + + /* traversal variables in registers */ + int stackPtr = 0; + int nodeAddr = kernel_data.bvh.root; + + /* ray parameters in registers */ + const float tmax = ray->t; + float3 P = ray->P; + float3 dir = bvh_clamp_direction(ray->D); + float3 idir = bvh_inverse_direction(dir); + int object = OBJECT_NONE; + float isect_t = tmax; + + const uint visibility = PATH_RAY_ALL_VISIBILITY; + +#if BVH_FEATURE(BVH_MOTION) + Transform ob_tfm; +#endif + +#if BVH_FEATURE(BVH_INSTANCING) + int num_hits_in_instance = 0; +#endif + + uint num_hits = 0; + isect_array->t = tmax; + +#if defined(__KERNEL_SSE2__) + const shuffle_swap_t shuf_identity = shuffle_swap_identity(); + const 
shuffle_swap_t shuf_swap = shuffle_swap_swap(); + + const ssef pn = cast(ssei(0, 0, 0x80000000, 0x80000000)); + ssef Psplat[3], idirsplat[3]; + shuffle_swap_t shufflexyz[3]; + + Psplat[0] = ssef(P.x); + Psplat[1] = ssef(P.y); + Psplat[2] = ssef(P.z); + + ssef tsplat(0.0f, 0.0f, -isect_t, -isect_t); + + gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz); +#endif + + IsectPrecalc isect_precalc; + triangle_intersect_precalc(dir, &isect_precalc); + + /* traversal loop */ + do { + do { + /* traverse internal nodes */ + while(nodeAddr >= 0 && nodeAddr != ENTRYPOINT_SENTINEL) { + bool traverseChild0, traverseChild1; + int nodeAddrChild1; + +#if !defined(__KERNEL_SSE2__) + /* Intersect two child bounding boxes, non-SSE version */ + float t = isect_array->t; + + /* fetch node data */ + float4 node0 = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+0); + float4 node1 = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+1); + float4 node2 = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+2); + float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+3); + + /* intersect ray against child nodes */ + NO_EXTENDED_PRECISION float c0lox = (node0.x - P.x) * idir.x; + NO_EXTENDED_PRECISION float c0hix = (node0.z - P.x) * idir.x; + NO_EXTENDED_PRECISION float c0loy = (node1.x - P.y) * idir.y; + NO_EXTENDED_PRECISION float c0hiy = (node1.z - P.y) * idir.y; + NO_EXTENDED_PRECISION float c0loz = (node2.x - P.z) * idir.z; + NO_EXTENDED_PRECISION float c0hiz = (node2.z - P.z) * idir.z; + NO_EXTENDED_PRECISION float c0min = max4(min(c0lox, c0hix), min(c0loy, c0hiy), min(c0loz, c0hiz), 0.0f); + NO_EXTENDED_PRECISION float c0max = min4(max(c0lox, c0hix), max(c0loy, c0hiy), max(c0loz, c0hiz), t); + + NO_EXTENDED_PRECISION float c1lox = (node0.y - P.x) * idir.x; + NO_EXTENDED_PRECISION float c1hix = (node0.w - P.x) * idir.x; + NO_EXTENDED_PRECISION float c1loy = (node1.y - P.y) * idir.y; + NO_EXTENDED_PRECISION float c1hiy = (node1.w - P.y) * 
idir.y; + NO_EXTENDED_PRECISION float c1loz = (node2.y - P.z) * idir.z; + NO_EXTENDED_PRECISION float c1hiz = (node2.w - P.z) * idir.z; + NO_EXTENDED_PRECISION float c1min = max4(min(c1lox, c1hix), min(c1loy, c1hiy), min(c1loz, c1hiz), 0.0f); + NO_EXTENDED_PRECISION float c1max = min4(max(c1lox, c1hix), max(c1loy, c1hiy), max(c1loz, c1hiz), t); + + /* decide which nodes to traverse next */ + traverseChild0 = (c0max >= c0min); + traverseChild1 = (c1max >= c1min); + +#else // __KERNEL_SSE2__ + /* Intersect two child bounding boxes, SSE3 version adapted from Embree */ + + /* fetch node data */ + const ssef *bvh_nodes = (ssef*)kg->__bvh_nodes.data + nodeAddr*BVH_NODE_SIZE; + const float4 cnodes = ((float4*)bvh_nodes)[3]; + + /* intersect ray against child nodes */ + const ssef tminmaxx = (shuffle_swap(bvh_nodes[0], shufflexyz[0]) - Psplat[0]) * idirsplat[0]; + const ssef tminmaxy = (shuffle_swap(bvh_nodes[1], shufflexyz[1]) - Psplat[1]) * idirsplat[1]; + const ssef tminmaxz = (shuffle_swap(bvh_nodes[2], shufflexyz[2]) - Psplat[2]) * idirsplat[2]; + + /* calculate { c0min, c1min, -c0max, -c1max} */ + ssef minmax = max(max(tminmaxx, tminmaxy), max(tminmaxz, tsplat)); + const ssef tminmax = minmax ^ pn; + + const sseb lrhit = tminmax <= shuffle<2, 3, 0, 1>(tminmax); + + /* decide which nodes to traverse next */ + traverseChild0 = (movemask(lrhit) & 1); + traverseChild1 = (movemask(lrhit) & 2); +#endif // __KERNEL_SSE2__ + + nodeAddr = __float_as_int(cnodes.x); + nodeAddrChild1 = __float_as_int(cnodes.y); + + if(traverseChild0 && traverseChild1) { + /* both children were intersected, push the farther one */ +#if !defined(__KERNEL_SSE2__) + bool closestChild1 = (c1min < c0min); +#else + bool closestChild1 = tminmax[1] < tminmax[0]; +#endif + + if(closestChild1) { + int tmp = nodeAddr; + nodeAddr = nodeAddrChild1; + nodeAddrChild1 = tmp; + } + + ++stackPtr; + kernel_assert(stackPtr < BVH_STACK_SIZE); + traversalStack[stackPtr] = nodeAddrChild1; + } + else { + /* one child 
was intersected */ + if(traverseChild1) { + nodeAddr = nodeAddrChild1; + } + else if(!traverseChild0) { + /* neither child was intersected */ + nodeAddr = traversalStack[stackPtr]; + --stackPtr; + } + } + } + + /* if node is leaf, fetch triangle list */ + if(nodeAddr < 0) { + float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-nodeAddr-1)*BVH_NODE_LEAF_SIZE); + int primAddr = __float_as_int(leaf.x); + +#if BVH_FEATURE(BVH_INSTANCING) + if(primAddr >= 0) { +#endif + const int primAddr2 = __float_as_int(leaf.y); + const uint type = __float_as_int(leaf.w); + bool hit; + + /* pop */ + nodeAddr = traversalStack[stackPtr]; + --stackPtr; + + /* primitive intersection */ + switch(type & PRIMITIVE_ALL) { + case PRIMITIVE_TRIANGLE: { + /* intersect ray against primitive */ + for(; primAddr < primAddr2; primAddr++) { + kernel_assert(kernel_tex_fetch(__prim_type, primAddr) == type); + /* only primitives from volume object */ + uint tri_object = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, primAddr): object; + int object_flag = kernel_tex_fetch(__object_flag, tri_object); + if((object_flag & SD_OBJECT_HAS_VOLUME) == 0) { + continue; + } + hit = triangle_intersect(kg, &isect_precalc, isect_array, P, visibility, object, primAddr); + if(hit) { + /* Move on to next entry in intersections array. 
*/ + isect_array++; + num_hits++; +#if BVH_FEATURE(BVH_INSTANCING) + num_hits_in_instance++; +#endif + isect_array->t = isect_t; + if(num_hits == max_hits) { +#if BVH_FEATURE(BVH_INSTANCING) +#if BVH_FEATURE(BVH_MOTION) + float t_fac = len(transform_direction(&ob_tfm, 1.0f/idir)); +#else + Transform tfm = object_fetch_transform(kg, object, OBJECT_TRANSFORM); + float t_fac = len(transform_direction(&tfm, 1.0f/idir)); +#endif + for(int i = 0; i < num_hits_in_instance; i++) { + (isect_array-i-1)->t *= t_fac; + } +#endif /* BVH_FEATURE(BVH_INSTANCING) */ + return num_hits; + } + } + } + break; + } +#if BVH_FEATURE(BVH_MOTION) + case PRIMITIVE_MOTION_TRIANGLE: { + /* intersect ray against primitive */ + for(; primAddr < primAddr2; primAddr++) { + kernel_assert(kernel_tex_fetch(__prim_type, primAddr) == type); + /* only primitives from volume object */ + uint tri_object = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, primAddr): object; + int object_flag = kernel_tex_fetch(__object_flag, tri_object); + if((object_flag & SD_OBJECT_HAS_VOLUME) == 0) { + continue; + } + hit = motion_triangle_intersect(kg, isect_array, P, dir, ray->time, visibility, object, primAddr); + if(hit) { + /* Move on to next entry in intersections array. 
*/ + isect_array++; + num_hits++; +#if BVH_FEATURE(BVH_INSTANCING) + num_hits_in_instance++; +#endif + isect_array->t = isect_t; + if(num_hits == max_hits) { +#if BVH_FEATURE(BVH_INSTANCING) +# if BVH_FEATURE(BVH_MOTION) + float t_fac = len(transform_direction(&ob_tfm, 1.0f/idir)); +# else + Transform tfm = object_fetch_transform(kg, object, OBJECT_TRANSFORM); + float t_fac = len(transform_direction(&tfm, 1.0f/idir)); +#endif + for(int i = 0; i < num_hits_in_instance; i++) { + (isect_array-i-1)->t *= t_fac; + } +#endif /* BVH_FEATURE(BVH_INSTANCING) */ + return num_hits; + } + } + } + break; + } +#endif +#if BVH_FEATURE(BVH_HAIR) + case PRIMITIVE_CURVE: + case PRIMITIVE_MOTION_CURVE: { + /* intersect ray against primitive */ + for(; primAddr < primAddr2; primAddr++) { + kernel_assert(kernel_tex_fetch(__prim_type, primAddr) == type); + /* only primitives from volume object */ + uint tri_object = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, primAddr): object; + int object_flag = kernel_tex_fetch(__object_flag, tri_object); + if((object_flag & SD_OBJECT_HAS_VOLUME) == 0) { + continue; + } + if(kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE) + hit = bvh_cardinal_curve_intersect(kg, isect_array, P, dir, visibility, object, primAddr, ray->time, type, NULL, 0, 0); + else + hit = bvh_curve_intersect(kg, isect_array, P, dir, visibility, object, primAddr, ray->time, type, NULL, 0, 0); + if(hit) { + /* Move on to next entry in intersections array. 
*/ + isect_array++; + num_hits++; +#if BVH_FEATURE(BVH_INSTANCING) + num_hits_in_instance++; +#endif + isect_array->t = isect_t; + if(num_hits == max_hits) { +#if BVH_FEATURE(BVH_INSTANCING) +# if BVH_FEATURE(BVH_MOTION) + float t_fac = len(transform_direction(&ob_tfm, 1.0f/idir)); +# else + Transform tfm = object_fetch_transform(kg, object, OBJECT_TRANSFORM); + float t_fac = len(transform_direction(&tfm, 1.0f/idir)); +#endif + for(int i = 0; i < num_hits_in_instance; i++) { + (isect_array-i-1)->t *= t_fac; + } +#endif /* BVH_FEATURE(BVH_INSTANCING) */ + return num_hits; + } + } + } + break; + } +#endif + default: { + break; + } + } + } +#if BVH_FEATURE(BVH_INSTANCING) + else { + /* instance push */ + object = kernel_tex_fetch(__prim_object, -primAddr-1); + int object_flag = kernel_tex_fetch(__object_flag, object); + + if(object_flag & SD_OBJECT_HAS_VOLUME) { + +#if BVH_FEATURE(BVH_MOTION) + bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &isect_t, &ob_tfm); +#else + bvh_instance_push(kg, object, ray, &P, &dir, &idir, &isect_t); +#endif + + triangle_intersect_precalc(dir, &isect_precalc); + num_hits_in_instance = 0; + isect_array->t = isect_t; + +#if defined(__KERNEL_SSE2__) + Psplat[0] = ssef(P.x); + Psplat[1] = ssef(P.y); + Psplat[2] = ssef(P.z); + + tsplat = ssef(0.0f, 0.0f, -isect_t, -isect_t); + + gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz); +#endif + + ++stackPtr; + kernel_assert(stackPtr < BVH_STACK_SIZE); + traversalStack[stackPtr] = ENTRYPOINT_SENTINEL; + + nodeAddr = kernel_tex_fetch(__object_node, object); + } + else { + /* pop */ + object = OBJECT_NONE; + nodeAddr = traversalStack[stackPtr]; + --stackPtr; + } + } + } +#endif /* FEATURE(BVH_INSTANCING) */ + } while(nodeAddr != ENTRYPOINT_SENTINEL); + +#if BVH_FEATURE(BVH_INSTANCING) + if(stackPtr >= 0) { + kernel_assert(object != OBJECT_NONE); + + if(num_hits_in_instance) { + float t_fac; +#if BVH_FEATURE(BVH_MOTION) + bvh_instance_motion_pop_factor(kg, 
object, ray, &P, &dir, &idir, &t_fac, &ob_tfm); +#else + bvh_instance_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac); +#endif + triangle_intersect_precalc(dir, &isect_precalc); + /* Scale isect->t to adjust for instancing. */ + for(int i = 0; i < num_hits_in_instance; i++) { + (isect_array-i-1)->t *= t_fac; + } + } + else { + float ignore_t = FLT_MAX; +#if BVH_FEATURE(BVH_MOTION) + bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, &ignore_t, &ob_tfm); +#else + bvh_instance_pop(kg, object, ray, &P, &dir, &idir, &ignore_t); +#endif + triangle_intersect_precalc(dir, &isect_precalc); + } + + isect_t = tmax; + isect_array->t = isect_t; + +#if defined(__KERNEL_SSE2__) + Psplat[0] = ssef(P.x); + Psplat[1] = ssef(P.y); + Psplat[2] = ssef(P.z); + + tsplat = ssef(0.0f, 0.0f, -isect_t, -isect_t); + + gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz); +#endif + + object = OBJECT_NONE; + nodeAddr = traversalStack[stackPtr]; + --stackPtr; + } +#endif /* FEATURE(BVH_INSTANCING) */ + } while(nodeAddr != ENTRYPOINT_SENTINEL); + + return num_hits; +} + +ccl_device_inline uint BVH_FUNCTION_NAME(KernelGlobals *kg, + const Ray *ray, + Intersection *isect_array, + const uint max_hits) +{ +#ifdef __QBVH__ + if(kernel_data.bvh.use_qbvh) { + return BVH_FUNCTION_FULL_NAME(QBVH)(kg, + ray, + isect_array, + max_hits); + } + else +#endif + { + kernel_assert(kernel_data.bvh.use_qbvh == false); + return BVH_FUNCTION_FULL_NAME(BVH)(kg, + ray, + isect_array, + max_hits); + } +} + +#undef BVH_FUNCTION_NAME +#undef BVH_FUNCTION_FEATURES diff --git a/intern/cycles/kernel/geom/geom_curve.h b/intern/cycles/kernel/geom/geom_curve.h index ac6c6ec4929..9653ad8f1bb 100644 --- a/intern/cycles/kernel/geom/geom_curve.h +++ b/intern/cycles/kernel/geom/geom_curve.h @@ -32,22 +32,22 @@ ccl_device float curve_attribute_float(KernelGlobals *kg, const ShaderData *sd, if(dy) *dy = 0.0f; #endif - return kernel_tex_fetch(__attributes_float, offset + sd->prim); + return 
kernel_tex_fetch(__attributes_float, offset + ccl_fetch(sd, prim)); } else if(elem == ATTR_ELEMENT_CURVE_KEY || elem == ATTR_ELEMENT_CURVE_KEY_MOTION) { - float4 curvedata = kernel_tex_fetch(__curves, sd->prim); - int k0 = __float_as_int(curvedata.x) + PRIMITIVE_UNPACK_SEGMENT(sd->type); + float4 curvedata = kernel_tex_fetch(__curves, ccl_fetch(sd, prim)); + int k0 = __float_as_int(curvedata.x) + PRIMITIVE_UNPACK_SEGMENT(ccl_fetch(sd, type)); int k1 = k0 + 1; float f0 = kernel_tex_fetch(__attributes_float, offset + k0); float f1 = kernel_tex_fetch(__attributes_float, offset + k1); #ifdef __RAY_DIFFERENTIALS__ - if(dx) *dx = sd->du.dx*(f1 - f0); + if(dx) *dx = ccl_fetch(sd, du).dx*(f1 - f0); if(dy) *dy = 0.0f; #endif - return (1.0f - sd->u)*f0 + sd->u*f1; + return (1.0f - ccl_fetch(sd, u))*f0 + ccl_fetch(sd, u)*f1; } else { #ifdef __RAY_DIFFERENTIALS__ @@ -71,22 +71,22 @@ ccl_device float3 curve_attribute_float3(KernelGlobals *kg, const ShaderData *sd if(dy) *dy = make_float3(0.0f, 0.0f, 0.0f); #endif - return float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + sd->prim)); + return float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + ccl_fetch(sd, prim))); } else if(elem == ATTR_ELEMENT_CURVE_KEY || elem == ATTR_ELEMENT_CURVE_KEY_MOTION) { - float4 curvedata = kernel_tex_fetch(__curves, sd->prim); - int k0 = __float_as_int(curvedata.x) + PRIMITIVE_UNPACK_SEGMENT(sd->type); + float4 curvedata = kernel_tex_fetch(__curves, ccl_fetch(sd, prim)); + int k0 = __float_as_int(curvedata.x) + PRIMITIVE_UNPACK_SEGMENT(ccl_fetch(sd, type)); int k1 = k0 + 1; float3 f0 = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + k0)); float3 f1 = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + k1)); #ifdef __RAY_DIFFERENTIALS__ - if(dx) *dx = sd->du.dx*(f1 - f0); + if(dx) *dx = ccl_fetch(sd, du).dx*(f1 - f0); if(dy) *dy = make_float3(0.0f, 0.0f, 0.0f); #endif - return (1.0f - sd->u)*f0 + sd->u*f1; + return (1.0f - ccl_fetch(sd, u))*f0 + 
ccl_fetch(sd, u)*f1; } else { #ifdef __RAY_DIFFERENTIALS__ @@ -104,22 +104,22 @@ ccl_device float curve_thickness(KernelGlobals *kg, ShaderData *sd) { float r = 0.0f; - if(sd->type & PRIMITIVE_ALL_CURVE) { - float4 curvedata = kernel_tex_fetch(__curves, sd->prim); - int k0 = __float_as_int(curvedata.x) + PRIMITIVE_UNPACK_SEGMENT(sd->type); + if(ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE) { + float4 curvedata = kernel_tex_fetch(__curves, ccl_fetch(sd, prim)); + int k0 = __float_as_int(curvedata.x) + PRIMITIVE_UNPACK_SEGMENT(ccl_fetch(sd, type)); int k1 = k0 + 1; float4 P_curve[2]; - if(sd->type & PRIMITIVE_CURVE) { + if(ccl_fetch(sd, type) & PRIMITIVE_CURVE) { P_curve[0]= kernel_tex_fetch(__curve_keys, k0); P_curve[1]= kernel_tex_fetch(__curve_keys, k1); } else { - motion_curve_keys(kg, sd->object, sd->prim, sd->time, k0, k1, P_curve); + motion_curve_keys(kg, ccl_fetch(sd, object), ccl_fetch(sd, prim), ccl_fetch(sd, time), k0, k1, P_curve); } - r = (P_curve[1].w - P_curve[0].w) * sd->u + P_curve[0].w; + r = (P_curve[1].w - P_curve[0].w) * ccl_fetch(sd, u) + P_curve[0].w; } return r*2.0f; @@ -130,8 +130,8 @@ ccl_device float curve_thickness(KernelGlobals *kg, ShaderData *sd) ccl_device float3 curve_motion_center_location(KernelGlobals *kg, ShaderData *sd) { - float4 curvedata = kernel_tex_fetch(__curves, sd->prim); - int k0 = __float_as_int(curvedata.x) + PRIMITIVE_UNPACK_SEGMENT(sd->type); + float4 curvedata = kernel_tex_fetch(__curves, ccl_fetch(sd, prim)); + int k0 = __float_as_int(curvedata.x) + PRIMITIVE_UNPACK_SEGMENT(ccl_fetch(sd, type)); int k1 = k0 + 1; float4 P_curve[2]; @@ -139,7 +139,7 @@ ccl_device float3 curve_motion_center_location(KernelGlobals *kg, ShaderData *sd P_curve[0]= kernel_tex_fetch(__curve_keys, k0); P_curve[1]= kernel_tex_fetch(__curve_keys, k1); - return float4_to_float3(P_curve[1]) * sd->u + float4_to_float3(P_curve[0]) * (1.0f - sd->u); + return float4_to_float3(P_curve[1]) * ccl_fetch(sd, u) + float4_to_float3(P_curve[0]) * (1.0f - 
ccl_fetch(sd, u)); } /* Curve tangent normal */ @@ -148,14 +148,14 @@ ccl_device float3 curve_tangent_normal(KernelGlobals *kg, ShaderData *sd) { float3 tgN = make_float3(0.0f,0.0f,0.0f); - if(sd->type & PRIMITIVE_ALL_CURVE) { + if(ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE) { - tgN = -(-sd->I - sd->dPdu * (dot(sd->dPdu,-sd->I) / len_squared(sd->dPdu))); + tgN = -(-ccl_fetch(sd, I) - ccl_fetch(sd, dPdu) * (dot(ccl_fetch(sd, dPdu),-ccl_fetch(sd, I)) / len_squared(ccl_fetch(sd, dPdu)))); tgN = normalize(tgN); /* need to find suitable scaled gd for corrected normal */ #if 0 - tgN = normalize(tgN - gd * sd->dPdu); + tgN = normalize(tgN - gd * ccl_fetch(sd, dPdu)); #endif } @@ -442,12 +442,12 @@ ccl_device_inline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersect float r_ext = mw_extension + r_curr; float coverage = 1.0f; - if (bminz - r_curr > isect->t || bmaxz + r_curr < epsilon || bminx > r_ext|| bmaxx < -r_ext|| bminy > r_ext|| bmaxy < -r_ext) { + if(bminz - r_curr > isect->t || bmaxz + r_curr < epsilon || bminx > r_ext|| bmaxx < -r_ext|| bminy > r_ext|| bmaxy < -r_ext) { /* the bounding box does not overlap the square centered at O */ tree += level; level = tree & -tree; } - else if (level == 1) { + else if(level == 1) { /* the maximum recursion depth is reached. * check if dP0.(Q-P0)>=0 and dPn.(Pn-Q)>=0. 
@@ -459,13 +459,13 @@ ccl_device_inline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersect if(flags & CURVE_KN_RIBBONS) { float3 tg = (p_en - p_st); float w = tg.x * tg.x + tg.y * tg.y; - if (w == 0) { + if(w == 0) { tree++; level = tree & -tree; continue; } w = -(p_st.x * tg.x + p_st.y * tg.y) / w; - w = clamp((float)w, 0.0f, 1.0f); + w = saturate(w); /* compute u on the curve segment */ u = i_st * (1 - w) + i_en * w; @@ -474,17 +474,17 @@ ccl_device_inline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersect float3 p_curr = ((curve_coef[3] * u + curve_coef[2]) * u + curve_coef[1]) * u + curve_coef[0]; float3 dp_st = (3 * curve_coef[3] * i_st + 2 * curve_coef[2]) * i_st + curve_coef[1]; - if (dot(tg, dp_st)< 0) + if(dot(tg, dp_st)< 0) dp_st *= -1; - if (dot(dp_st, -p_st) + p_curr.z * dp_st.z < 0) { + if(dot(dp_st, -p_st) + p_curr.z * dp_st.z < 0) { tree++; level = tree & -tree; continue; } float3 dp_en = (3 * curve_coef[3] * i_en + 2 * curve_coef[2]) * i_en + curve_coef[1]; - if (dot(tg, dp_en) < 0) + if(dot(tg, dp_en) < 0) dp_en *= -1; - if (dot(dp_en, p_en) - p_curr.z * dp_en.z < 0) { + if(dot(dp_en, p_en) - p_curr.z * dp_en.z < 0) { tree++; level = tree & -tree; continue; @@ -500,13 +500,13 @@ ccl_device_inline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersect float d0 = d - r_curr; float d1 = d + r_curr; float inv_mw_extension = 1.0f/mw_extension; - if (d0 >= 0) + if(d0 >= 0) coverage = (min(d1 * inv_mw_extension, 1.0f) - min(d0 * inv_mw_extension, 1.0f)) * 0.5f; else // inside coverage = (min(d1 * inv_mw_extension, 1.0f) + min(-d0 * inv_mw_extension, 1.0f)) * 0.5f; } - if (p_curr.x * p_curr.x + p_curr.y * p_curr.y >= r_ext * r_ext || p_curr.z <= epsilon || isect->t < p_curr.z) { + if(p_curr.x * p_curr.x + p_curr.y * p_curr.y >= r_ext * r_ext || p_curr.z <= epsilon || isect->t < p_curr.z) { tree++; level = tree & -tree; continue; @@ -548,7 +548,7 @@ ccl_device_inline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, 
Intersect float tb = 2*(tdif.z - tg.z*(tdifz + gd*(tdifz*gd + or1))); float tc = dot(tdif,tdif) - tdifz * tdifz * (1 + gd*gd) - or1*or1 - 2*or1*tdifz*gd; float td = tb*tb - 4*cyla*tc; - if (td < 0.0f) { + if(td < 0.0f) { tree++; level = tree & -tree; continue; @@ -559,10 +559,10 @@ ccl_device_inline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersect t = tcentre + correction; float3 dp_st = (3 * curve_coef[3] * i_st + 2 * curve_coef[2]) * i_st + curve_coef[1]; - if (dot(tg, dp_st)< 0) + if(dot(tg, dp_st)< 0) dp_st *= -1; float3 dp_en = (3 * curve_coef[3] * i_en + 2 * curve_coef[2]) * i_en + curve_coef[1]; - if (dot(tg, dp_en) < 0) + if(dot(tg, dp_en) < 0) dp_en *= -1; if(flags & CURVE_KN_BACKFACING && (dot(dp_st, -p_st) + t * dp_st.z < 0 || dot(dp_en, p_en) - t * dp_en.z < 0 || isect->t < t || t <= 0.0f)) { @@ -570,14 +570,14 @@ ccl_device_inline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersect t = tcentre + correction; } - if (dot(dp_st, -p_st) + t * dp_st.z < 0 || dot(dp_en, p_en) - t * dp_en.z < 0 || isect->t < t || t <= 0.0f) { + if(dot(dp_st, -p_st) + t * dp_st.z < 0 || dot(dp_en, p_en) - t * dp_en.z < 0 || isect->t < t || t <= 0.0f) { tree++; level = tree & -tree; continue; } float w = (zcentre + (tg.z * correction)) * invl; - w = clamp((float)w, 0.0f, 1.0f); + w = saturate(w); /* compute u on the curve segment */ u = i_st * (1 - w) + i_en * w; @@ -777,7 +777,7 @@ ccl_device_inline bool bvh_curve_intersect(KernelGlobals *kg, Intersection *isec float tc = dot3(tdif, tdif) - tdifz*tdifz - tdifma*tdifma; float td = tb*tb - 4*a*tc; - if (td < 0.0f) + if(td < 0.0f) return false; float rootd = 0.0f; @@ -818,7 +818,7 @@ ccl_device_inline bool bvh_curve_intersect(KernelGlobals *kg, Intersection *isec if(t > 0.0f && t < isect->t && z >= 0 && z <= l) { - if (flags & CURVE_KN_ENCLOSEFILTER) { + if(flags & CURVE_KN_ENCLOSEFILTER) { float enc_ratio = 1.01f; if((difz > -r1 * enc_ratio) && (dot3(dif_second, tg) < r2 * enc_ratio)) { float a2 = 1.0f - 
(dirz*dirz*(1 + gd*gd*enc_ratio*enc_ratio)); @@ -890,7 +890,7 @@ ccl_device_inline float3 bvh_curve_refine(KernelGlobals *kg, ShaderData *sd, con if(isect->object != OBJECT_NONE) { #ifdef __OBJECT_MOTION__ - Transform tfm = sd->ob_itfm; + Transform tfm = ccl_fetch(sd, ob_itfm); #else Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_INVERSE_TRANSFORM); #endif @@ -903,7 +903,7 @@ ccl_device_inline float3 bvh_curve_refine(KernelGlobals *kg, ShaderData *sd, con int prim = kernel_tex_fetch(__prim_index, isect->prim); float4 v00 = kernel_tex_fetch(__curves, prim); - int k0 = __float_as_int(v00.x) + PRIMITIVE_UNPACK_SEGMENT(sd->type); + int k0 = __float_as_int(v00.x) + PRIMITIVE_UNPACK_SEGMENT(ccl_fetch(sd, type)); int k1 = k0 + 1; float3 tg; @@ -914,14 +914,14 @@ ccl_device_inline float3 bvh_curve_refine(KernelGlobals *kg, ShaderData *sd, con float4 P_curve[4]; - if(sd->type & PRIMITIVE_CURVE) { + if(ccl_fetch(sd, type) & PRIMITIVE_CURVE) { P_curve[0] = kernel_tex_fetch(__curve_keys, ka); P_curve[1] = kernel_tex_fetch(__curve_keys, k0); P_curve[2] = kernel_tex_fetch(__curve_keys, k1); P_curve[3] = kernel_tex_fetch(__curve_keys, kb); } else { - motion_cardinal_curve_keys(kg, sd->object, sd->prim, sd->time, ka, k0, k1, kb, P_curve); + motion_cardinal_curve_keys(kg, ccl_fetch(sd, object), ccl_fetch(sd, prim), ccl_fetch(sd, time), ka, k0, k1, kb, P_curve); } float3 p[4]; @@ -933,43 +933,43 @@ ccl_device_inline float3 bvh_curve_refine(KernelGlobals *kg, ShaderData *sd, con P = P + D*t; #ifdef __UV__ - sd->u = isect->u; - sd->v = 0.0f; + ccl_fetch(sd, u) = isect->u; + ccl_fetch(sd, v) = 0.0f; #endif tg = normalize(curvetangent(isect->u, p[0], p[1], p[2], p[3])); if(kernel_data.curve.curveflags & CURVE_KN_RIBBONS) { - sd->Ng = normalize(-(D - tg * (dot(tg, D)))); + ccl_fetch(sd, Ng) = normalize(-(D - tg * (dot(tg, D)))); } else { /* direction from inside to surface of curve */ float3 p_curr = curvepoint(isect->u, p[0], p[1], p[2], p[3]); - sd->Ng = normalize(P - 
p_curr); + ccl_fetch(sd, Ng) = normalize(P - p_curr); /* adjustment for changing radius */ float gd = isect->v; if(gd != 0.0f) { - sd->Ng = sd->Ng - gd * tg; - sd->Ng = normalize(sd->Ng); + ccl_fetch(sd, Ng) = ccl_fetch(sd, Ng) - gd * tg; + ccl_fetch(sd, Ng) = normalize(ccl_fetch(sd, Ng)); } } /* todo: sometimes the normal is still so that this is detected as * backfacing even if cull backfaces is enabled */ - sd->N = sd->Ng; + ccl_fetch(sd, N) = ccl_fetch(sd, Ng); } else { float4 P_curve[2]; - if(sd->type & PRIMITIVE_CURVE) { + if(ccl_fetch(sd, type) & PRIMITIVE_CURVE) { P_curve[0]= kernel_tex_fetch(__curve_keys, k0); P_curve[1]= kernel_tex_fetch(__curve_keys, k1); } else { - motion_curve_keys(kg, sd->object, sd->prim, sd->time, k0, k1, P_curve); + motion_curve_keys(kg, ccl_fetch(sd, object), ccl_fetch(sd, prim), ccl_fetch(sd, time), k0, k1, P_curve); } float l = 1.0f; @@ -980,39 +980,39 @@ ccl_device_inline float3 bvh_curve_refine(KernelGlobals *kg, ShaderData *sd, con float3 dif = P - float4_to_float3(P_curve[0]); #ifdef __UV__ - sd->u = dot(dif,tg)/l; - sd->v = 0.0f; + ccl_fetch(sd, u) = dot(dif,tg)/l; + ccl_fetch(sd, v) = 0.0f; #endif - if (flag & CURVE_KN_TRUETANGENTGNORMAL) { - sd->Ng = -(D - tg * dot(tg, D)); - sd->Ng = normalize(sd->Ng); + if(flag & CURVE_KN_TRUETANGENTGNORMAL) { + ccl_fetch(sd, Ng) = -(D - tg * dot(tg, D)); + ccl_fetch(sd, Ng) = normalize(ccl_fetch(sd, Ng)); } else { float gd = isect->v; /* direction from inside to surface of curve */ - sd->Ng = (dif - tg * sd->u * l) / (P_curve[0].w + sd->u * l * gd); + ccl_fetch(sd, Ng) = (dif - tg * ccl_fetch(sd, u) * l) / (P_curve[0].w + ccl_fetch(sd, u) * l * gd); /* adjustment for changing radius */ - if (gd != 0.0f) { - sd->Ng = sd->Ng - gd * tg; - sd->Ng = normalize(sd->Ng); + if(gd != 0.0f) { + ccl_fetch(sd, Ng) = ccl_fetch(sd, Ng) - gd * tg; + ccl_fetch(sd, Ng) = normalize(ccl_fetch(sd, Ng)); } } - sd->N = sd->Ng; + ccl_fetch(sd, N) = ccl_fetch(sd, Ng); } #ifdef __DPDU__ /* dPdu/dPdv */ - 
sd->dPdu = tg; - sd->dPdv = cross(tg, sd->Ng); + ccl_fetch(sd, dPdu) = tg; + ccl_fetch(sd, dPdv) = cross(tg, ccl_fetch(sd, Ng)); #endif if(isect->object != OBJECT_NONE) { #ifdef __OBJECT_MOTION__ - Transform tfm = sd->ob_tfm; + Transform tfm = ccl_fetch(sd, ob_tfm); #else Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_TRANSFORM); #endif diff --git a/intern/cycles/kernel/geom/geom_motion_triangle.h b/intern/cycles/kernel/geom/geom_motion_triangle.h index d3297e05c67..86f93f242a1 100644 --- a/intern/cycles/kernel/geom/geom_motion_triangle.h +++ b/intern/cycles/kernel/geom/geom_motion_triangle.h @@ -134,7 +134,7 @@ ccl_device_inline float3 motion_triangle_refine(KernelGlobals *kg, ShaderData *s return P; } #ifdef __OBJECT_MOTION__ - Transform tfm = sd->ob_itfm; + Transform tfm = ccl_fetch(sd, ob_itfm); #else Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_INVERSE_TRANSFORM); #endif @@ -161,7 +161,7 @@ ccl_device_inline float3 motion_triangle_refine(KernelGlobals *kg, ShaderData *s if(isect->object != OBJECT_NONE) { #ifdef __OBJECT_MOTION__ - Transform tfm = sd->ob_tfm; + Transform tfm = ccl_fetch(sd, ob_tfm); #else Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_TRANSFORM); #endif @@ -187,7 +187,7 @@ ccl_device_inline float3 motion_triangle_refine_subsurface(KernelGlobals *kg, Sh #ifdef __INTERSECTION_REFINE__ if(isect->object != OBJECT_NONE) { #ifdef __OBJECT_MOTION__ - Transform tfm = sd->ob_itfm; + Transform tfm = ccl_fetch(sd, ob_itfm); #else Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_INVERSE_TRANSFORM); #endif @@ -213,7 +213,7 @@ ccl_device_inline float3 motion_triangle_refine_subsurface(KernelGlobals *kg, Sh if(isect->object != OBJECT_NONE) { #ifdef __OBJECT_MOTION__ - Transform tfm = sd->ob_tfm; + Transform tfm = ccl_fetch(sd, ob_tfm); #else Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_TRANSFORM); #endif @@ -236,25 +236,25 @@ ccl_device_inline float3 
motion_triangle_refine_subsurface(KernelGlobals *kg, Sh ccl_device_noinline void motion_triangle_shader_setup(KernelGlobals *kg, ShaderData *sd, const Intersection *isect, const Ray *ray, bool subsurface) { /* get shader */ - sd->shader = kernel_tex_fetch(__tri_shader, sd->prim); + ccl_fetch(sd, shader) = kernel_tex_fetch(__tri_shader, ccl_fetch(sd, prim)); /* get motion info */ int numsteps, numverts; - object_motion_info(kg, sd->object, &numsteps, &numverts, NULL); + object_motion_info(kg, ccl_fetch(sd, object), &numsteps, &numverts, NULL); /* figure out which steps we need to fetch and their interpolation factor */ int maxstep = numsteps*2; - int step = min((int)(sd->time*maxstep), maxstep-1); - float t = sd->time*maxstep - step; + int step = min((int)(ccl_fetch(sd, time)*maxstep), maxstep-1); + float t = ccl_fetch(sd, time)*maxstep - step; /* find attribute */ AttributeElement elem; - int offset = find_attribute_motion(kg, sd->object, ATTR_STD_MOTION_VERTEX_POSITION, &elem); + int offset = find_attribute_motion(kg, ccl_fetch(sd, object), ATTR_STD_MOTION_VERTEX_POSITION, &elem); kernel_assert(offset != ATTR_STD_NOT_FOUND); /* fetch vertex coordinates */ float3 verts[3], next_verts[3]; - float3 tri_vindex = float4_to_float3(kernel_tex_fetch(__tri_vindex, sd->prim)); + float3 tri_vindex = float4_to_float3(kernel_tex_fetch(__tri_vindex, ccl_fetch(sd, prim))); motion_triangle_verts_for_step(kg, tri_vindex, offset, numverts, numsteps, step, verts); motion_triangle_verts_for_step(kg, tri_vindex, offset, numverts, numsteps, step+1, next_verts); @@ -268,33 +268,33 @@ ccl_device_noinline void motion_triangle_shader_setup(KernelGlobals *kg, ShaderD #ifdef __SUBSURFACE__ if(!subsurface) #endif - sd->P = motion_triangle_refine(kg, sd, isect, ray, verts); + ccl_fetch(sd, P) = motion_triangle_refine(kg, sd, isect, ray, verts); #ifdef __SUBSURFACE__ else - sd->P = motion_triangle_refine_subsurface(kg, sd, isect, ray, verts); + ccl_fetch(sd, P) = 
motion_triangle_refine_subsurface(kg, sd, isect, ray, verts); #endif /* compute face normal */ float3 Ng; - if(sd->flag & SD_NEGATIVE_SCALE_APPLIED) + if(ccl_fetch(sd, flag) & SD_NEGATIVE_SCALE_APPLIED) Ng = normalize(cross(verts[2] - verts[0], verts[1] - verts[0])); else Ng = normalize(cross(verts[1] - verts[0], verts[2] - verts[0])); - sd->Ng = Ng; - sd->N = Ng; + ccl_fetch(sd, Ng) = Ng; + ccl_fetch(sd, N) = Ng; /* compute derivatives of P w.r.t. uv */ #ifdef __DPDU__ - sd->dPdu = (verts[0] - verts[2]); - sd->dPdv = (verts[1] - verts[2]); + ccl_fetch(sd, dPdu) = (verts[0] - verts[2]); + ccl_fetch(sd, dPdv) = (verts[1] - verts[2]); #endif /* compute smooth normal */ - if(sd->shader & SHADER_SMOOTH_NORMAL) { + if(ccl_fetch(sd, shader) & SHADER_SMOOTH_NORMAL) { /* find attribute */ AttributeElement elem; - int offset = find_attribute_motion(kg, sd->object, ATTR_STD_MOTION_VERTEX_NORMAL, &elem); + int offset = find_attribute_motion(kg, ccl_fetch(sd, object), ATTR_STD_MOTION_VERTEX_NORMAL, &elem); kernel_assert(offset != ATTR_STD_NOT_FOUND); /* fetch vertex coordinates */ @@ -308,10 +308,10 @@ ccl_device_noinline void motion_triangle_shader_setup(KernelGlobals *kg, ShaderD normals[2] = (1.0f - t)*normals[2] + t*next_normals[2]; /* interpolate between vertices */ - float u = sd->u; - float v = sd->v; + float u = ccl_fetch(sd, u); + float v = ccl_fetch(sd, v); float w = 1.0f - u - v; - sd->N = (u*normals[0] + v*normals[1] + w*normals[2]); + ccl_fetch(sd, N) = (u*normals[0] + v*normals[1] + w*normals[2]); } } diff --git a/intern/cycles/kernel/geom/geom_object.h b/intern/cycles/kernel/geom/geom_object.h index 79a56683454..9d0a008fff1 100644 --- a/intern/cycles/kernel/geom/geom_object.h +++ b/intern/cycles/kernel/geom/geom_object.h @@ -123,9 +123,9 @@ ccl_device_inline Transform object_fetch_transform_motion_test(KernelGlobals *kg ccl_device_inline void object_position_transform(KernelGlobals *kg, const ShaderData *sd, float3 *P) { #ifdef __OBJECT_MOTION__ - *P = 
transform_point(&sd->ob_tfm, *P); + *P = transform_point_auto(&ccl_fetch(sd, ob_tfm), *P); #else - Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_TRANSFORM); + Transform tfm = object_fetch_transform(kg, ccl_fetch(sd, object), OBJECT_TRANSFORM); *P = transform_point(&tfm, *P); #endif } @@ -135,9 +135,9 @@ ccl_device_inline void object_position_transform(KernelGlobals *kg, const Shader ccl_device_inline void object_inverse_position_transform(KernelGlobals *kg, const ShaderData *sd, float3 *P) { #ifdef __OBJECT_MOTION__ - *P = transform_point(&sd->ob_itfm, *P); + *P = transform_point_auto(&ccl_fetch(sd, ob_itfm), *P); #else - Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_INVERSE_TRANSFORM); + Transform tfm = object_fetch_transform(kg, ccl_fetch(sd, object), OBJECT_INVERSE_TRANSFORM); *P = transform_point(&tfm, *P); #endif } @@ -147,9 +147,9 @@ ccl_device_inline void object_inverse_position_transform(KernelGlobals *kg, cons ccl_device_inline void object_inverse_normal_transform(KernelGlobals *kg, const ShaderData *sd, float3 *N) { #ifdef __OBJECT_MOTION__ - *N = normalize(transform_direction_transposed(&sd->ob_tfm, *N)); + *N = normalize(transform_direction_transposed_auto(&ccl_fetch(sd, ob_tfm), *N)); #else - Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_TRANSFORM); + Transform tfm = object_fetch_transform(kg, ccl_fetch(sd, object), OBJECT_TRANSFORM); *N = normalize(transform_direction_transposed(&tfm, *N)); #endif } @@ -159,9 +159,9 @@ ccl_device_inline void object_inverse_normal_transform(KernelGlobals *kg, const ccl_device_inline void object_normal_transform(KernelGlobals *kg, const ShaderData *sd, float3 *N) { #ifdef __OBJECT_MOTION__ - *N = normalize(transform_direction_transposed(&sd->ob_itfm, *N)); + *N = normalize(transform_direction_transposed_auto(&ccl_fetch(sd, ob_itfm), *N)); #else - Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_INVERSE_TRANSFORM); + Transform tfm = object_fetch_transform(kg, 
ccl_fetch(sd, object), OBJECT_INVERSE_TRANSFORM); *N = normalize(transform_direction_transposed(&tfm, *N)); #endif } @@ -171,9 +171,9 @@ ccl_device_inline void object_normal_transform(KernelGlobals *kg, const ShaderDa ccl_device_inline void object_dir_transform(KernelGlobals *kg, const ShaderData *sd, float3 *D) { #ifdef __OBJECT_MOTION__ - *D = transform_direction(&sd->ob_tfm, *D); + *D = transform_direction_auto(&ccl_fetch(sd, ob_tfm), *D); #else - Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_TRANSFORM); + Transform tfm = object_fetch_transform(kg, ccl_fetch(sd, object), OBJECT_TRANSFORM); *D = transform_direction(&tfm, *D); #endif } @@ -183,9 +183,9 @@ ccl_device_inline void object_dir_transform(KernelGlobals *kg, const ShaderData ccl_device_inline void object_inverse_dir_transform(KernelGlobals *kg, const ShaderData *sd, float3 *D) { #ifdef __OBJECT_MOTION__ - *D = transform_direction(&sd->ob_itfm, *D); + *D = transform_direction_auto(&ccl_fetch(sd, ob_itfm), *D); #else - Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_INVERSE_TRANSFORM); + Transform tfm = object_fetch_transform(kg, ccl_fetch(sd, object), OBJECT_INVERSE_TRANSFORM); *D = transform_direction(&tfm, *D); #endif } @@ -194,13 +194,13 @@ ccl_device_inline void object_inverse_dir_transform(KernelGlobals *kg, const Sha ccl_device_inline float3 object_location(KernelGlobals *kg, const ShaderData *sd) { - if(sd->object == OBJECT_NONE) + if(ccl_fetch(sd, object) == OBJECT_NONE) return make_float3(0.0f, 0.0f, 0.0f); #ifdef __OBJECT_MOTION__ - return make_float3(sd->ob_tfm.x.w, sd->ob_tfm.y.w, sd->ob_tfm.z.w); + return make_float3(ccl_fetch(sd, ob_tfm).x.w, ccl_fetch(sd, ob_tfm).y.w, ccl_fetch(sd, ob_tfm).z.w); #else - Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_TRANSFORM); + Transform tfm = object_fetch_transform(kg, ccl_fetch(sd, object), OBJECT_TRANSFORM); return make_float3(tfm.x.w, tfm.y.w, tfm.z.w); #endif } @@ -243,7 +243,7 @@ ccl_device_inline float 
object_random_number(KernelGlobals *kg, int object) ccl_device_inline int object_particle_id(KernelGlobals *kg, int object) { if(object == OBJECT_NONE) - return 0.0f; + return 0; int offset = object*OBJECT_SIZE + OBJECT_PROPERTIES; float4 f = kernel_tex_fetch(__objects, offset); @@ -296,7 +296,7 @@ ccl_device_inline void object_motion_info(KernelGlobals *kg, int object, int *nu ccl_device int shader_pass_id(KernelGlobals *kg, const ShaderData *sd) { - return kernel_tex_fetch(__shader_flag, (sd->shader & SHADER_MASK)*2 + 1); + return kernel_tex_fetch(__shader_flag, (ccl_fetch(sd, shader) & SHADER_MASK)*2 + 1); } /* Particle data from which object was instanced */ @@ -377,7 +377,7 @@ ccl_device_inline float3 bvh_inverse_direction(float3 dir) /* Transform ray into object space to enter static object in BVH */ -ccl_device_inline void bvh_instance_push(KernelGlobals *kg, int object, const Ray *ray, float3 *P, float3 *dir, float3 *idir, float *t) +ccl_device_inline void bvh_instance_push(KernelGlobals *kg, int object, const Ray *ray, float3 *P, float3 *dir, float3 *idir, ccl_addr_space float *t) { Transform tfm = object_fetch_transform(kg, object, OBJECT_INVERSE_TRANSFORM); @@ -425,7 +425,7 @@ ccl_device_inline void qbvh_instance_push(KernelGlobals *kg, /* Transorm ray to exit static object in BVH */ -ccl_device_inline void bvh_instance_pop(KernelGlobals *kg, int object, const Ray *ray, float3 *P, float3 *dir, float3 *idir, float *t) +ccl_device_inline void bvh_instance_pop(KernelGlobals *kg, int object, const Ray *ray, float3 *P, float3 *dir, float3 *idir, ccl_addr_space float *t) { if(*t != FLT_MAX) { Transform tfm = object_fetch_transform(kg, object, OBJECT_TRANSFORM); @@ -453,7 +453,7 @@ ccl_device_inline void bvh_instance_pop_factor(KernelGlobals *kg, int object, co #ifdef __OBJECT_MOTION__ /* Transform ray into object space to enter motion blurred object in BVH */ -ccl_device_inline void bvh_instance_motion_push(KernelGlobals *kg, int object, const Ray *ray, float3 
*P, float3 *dir, float3 *idir, float *t, Transform *tfm) +ccl_device_inline void bvh_instance_motion_push(KernelGlobals *kg, int object, const Ray *ray, float3 *P, float3 *dir, float3 *idir, ccl_addr_space float *t, Transform *tfm) { Transform itfm; *tfm = object_fetch_transform_motion_test(kg, object, ray->time, &itfm); @@ -497,7 +497,7 @@ ccl_device_inline void qbvh_instance_motion_push(KernelGlobals *kg, int object, /* Transorm ray to exit motion blurred object in BVH */ -ccl_device_inline void bvh_instance_motion_pop(KernelGlobals *kg, int object, const Ray *ray, float3 *P, float3 *dir, float3 *idir, float *t, Transform *tfm) +ccl_device_inline void bvh_instance_motion_pop(KernelGlobals *kg, int object, const Ray *ray, float3 *P, float3 *dir, float3 *idir, ccl_addr_space float *t, Transform *tfm) { if(*t != FLT_MAX) *t *= len(transform_direction(tfm, 1.0f/(*idir))); @@ -520,5 +520,38 @@ ccl_device_inline void bvh_instance_motion_pop_factor(KernelGlobals *kg, int obj #endif +/* TODO(sergey): This is only for until we've got OpenCL 2.0 + * on all devices we consider supported. It'll be replaced with + * generic address space. 
+ */ + +#ifdef __KERNEL_OPENCL__ +ccl_device_inline void object_dir_transform_addrspace(KernelGlobals *kg, + const ShaderData *sd, + ccl_addr_space float3 *D) +{ + float3 private_D = *D; + object_dir_transform(kg, sd, &private_D); + *D = private_D; +} + +ccl_device_inline void object_normal_transform_addrspace(KernelGlobals *kg, + const ShaderData *sd, + ccl_addr_space float3 *N) +{ + float3 private_N = *N; + object_normal_transform(kg, sd, &private_N); + *N = private_N; +} +#endif + +#ifndef __KERNEL_OPENCL__ +# define object_dir_transform_auto object_dir_transform +# define object_normal_transform_auto object_normal_transform +#else +# define object_dir_transform_auto object_dir_transform_addrspace +# define object_normal_transform_auto object_normal_transform_addrspace +#endif + CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/geom/geom_primitive.h b/intern/cycles/kernel/geom/geom_primitive.h index b52ec7ef1b2..30f12d32355 100644 --- a/intern/cycles/kernel/geom/geom_primitive.h +++ b/intern/cycles/kernel/geom/geom_primitive.h @@ -25,16 +25,16 @@ CCL_NAMESPACE_BEGIN ccl_device float primitive_attribute_float(KernelGlobals *kg, const ShaderData *sd, AttributeElement elem, int offset, float *dx, float *dy) { - if(sd->type & PRIMITIVE_ALL_TRIANGLE) { + if(ccl_fetch(sd, type) & PRIMITIVE_ALL_TRIANGLE) { return triangle_attribute_float(kg, sd, elem, offset, dx, dy); } #ifdef __HAIR__ - else if(sd->type & PRIMITIVE_ALL_CURVE) { + else if(ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE) { return curve_attribute_float(kg, sd, elem, offset, dx, dy); } #endif #ifdef __VOLUME__ - else if(sd->object != OBJECT_NONE && elem == ATTR_ELEMENT_VOXEL) { + else if(ccl_fetch(sd, object) != OBJECT_NONE && elem == ATTR_ELEMENT_VOXEL) { return volume_attribute_float(kg, sd, elem, offset, dx, dy); } #endif @@ -47,16 +47,16 @@ ccl_device float primitive_attribute_float(KernelGlobals *kg, const ShaderData * ccl_device float3 primitive_attribute_float3(KernelGlobals *kg, const ShaderData *sd, 
AttributeElement elem, int offset, float3 *dx, float3 *dy) { - if(sd->type & PRIMITIVE_ALL_TRIANGLE) { + if(ccl_fetch(sd, type) & PRIMITIVE_ALL_TRIANGLE) { return triangle_attribute_float3(kg, sd, elem, offset, dx, dy); } #ifdef __HAIR__ - else if(sd->type & PRIMITIVE_ALL_CURVE) { + else if(ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE) { return curve_attribute_float3(kg, sd, elem, offset, dx, dy); } #endif #ifdef __VOLUME__ - else if(sd->object != OBJECT_NONE && elem == ATTR_ELEMENT_VOXEL) { + else if(ccl_fetch(sd, object) != OBJECT_NONE && elem == ATTR_ELEMENT_VOXEL) { return volume_attribute_float3(kg, sd, elem, offset, dx, dy); } #endif @@ -108,9 +108,9 @@ ccl_device bool primitive_ptex(KernelGlobals *kg, ShaderData *sd, float2 *uv, in ccl_device float3 primitive_tangent(KernelGlobals *kg, ShaderData *sd) { #ifdef __HAIR__ - if(sd->type & PRIMITIVE_ALL_CURVE) + if(ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE) #ifdef __DPDU__ - return normalize(sd->dPdu); + return normalize(ccl_fetch(sd, dPdu)); #else return make_float3(0.0f, 0.0f, 0.0f); #endif @@ -124,12 +124,12 @@ ccl_device float3 primitive_tangent(KernelGlobals *kg, ShaderData *sd) float3 data = primitive_attribute_float3(kg, sd, attr_elem, attr_offset, NULL, NULL); data = make_float3(-(data.y - 0.5f), (data.x - 0.5f), 0.0f); object_normal_transform(kg, sd, &data); - return cross(sd->N, normalize(cross(data, sd->N))); + return cross(ccl_fetch(sd, N), normalize(cross(data, ccl_fetch(sd, N)))); } else { /* otherwise use surface derivatives */ #ifdef __DPDU__ - return normalize(sd->dPdu); + return normalize(ccl_fetch(sd, dPdu)); #else return make_float3(0.0f, 0.0f, 0.0f); #endif @@ -144,16 +144,16 @@ ccl_device float4 primitive_motion_vector(KernelGlobals *kg, ShaderData *sd) float3 center; #ifdef __HAIR__ - bool is_curve_primitive = sd->type & PRIMITIVE_ALL_CURVE; + bool is_curve_primitive = ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE; if(is_curve_primitive) { center = curve_motion_center_location(kg, sd); - 
if(!(sd->flag & SD_TRANSFORM_APPLIED)) + if(!(ccl_fetch(sd, flag) & SD_TRANSFORM_APPLIED)) object_position_transform(kg, sd, ¢er); } else #endif - center = sd->P; + center = ccl_fetch(sd, P); float3 motion_pre = center, motion_post = center; @@ -164,16 +164,16 @@ ccl_device float4 primitive_motion_vector(KernelGlobals *kg, ShaderData *sd) if(offset != ATTR_STD_NOT_FOUND) { /* get motion info */ int numverts, numkeys; - object_motion_info(kg, sd->object, NULL, &numverts, &numkeys); + object_motion_info(kg, ccl_fetch(sd, object), NULL, &numverts, &numkeys); /* lookup attributes */ - int offset_next = (sd->type & PRIMITIVE_ALL_TRIANGLE)? offset + numverts: offset + numkeys; + int offset_next = (ccl_fetch(sd, type) & PRIMITIVE_ALL_TRIANGLE)? offset + numverts: offset + numkeys; motion_pre = primitive_attribute_float3(kg, sd, elem, offset, NULL, NULL); motion_post = primitive_attribute_float3(kg, sd, elem, offset_next, NULL, NULL); #ifdef __HAIR__ - if(is_curve_primitive && (sd->flag & SD_OBJECT_HAS_VERTEX_MOTION) == 0) { + if(is_curve_primitive && (ccl_fetch(sd, flag) & SD_OBJECT_HAS_VERTEX_MOTION) == 0) { object_position_transform(kg, sd, &motion_pre); object_position_transform(kg, sd, &motion_post); } @@ -184,17 +184,17 @@ ccl_device float4 primitive_motion_vector(KernelGlobals *kg, ShaderData *sd) * transformation was set match the world/object space of motion_pre/post */ Transform tfm; - tfm = object_fetch_vector_transform(kg, sd->object, OBJECT_VECTOR_MOTION_PRE); + tfm = object_fetch_vector_transform(kg, ccl_fetch(sd, object), OBJECT_VECTOR_MOTION_PRE); motion_pre = transform_point(&tfm, motion_pre); - tfm = object_fetch_vector_transform(kg, sd->object, OBJECT_VECTOR_MOTION_POST); + tfm = object_fetch_vector_transform(kg, ccl_fetch(sd, object), OBJECT_VECTOR_MOTION_POST); motion_post = transform_point(&tfm, motion_post); float3 motion_center; /* camera motion, for perspective/orthographic motion.pre/post will be a * world-to-raster matrix, for panorama it's 
world-to-camera */ - if (kernel_data.cam.type != CAMERA_PANORAMA) { + if(kernel_data.cam.type != CAMERA_PANORAMA) { tfm = kernel_data.cam.worldtoraster; motion_center = transform_perspective(&tfm, center); diff --git a/intern/cycles/kernel/geom/geom_qbvh_shadow.h b/intern/cycles/kernel/geom/geom_qbvh_shadow.h index 4233ff15c86..f79b2ed9f34 100644 --- a/intern/cycles/kernel/geom/geom_qbvh_shadow.h +++ b/intern/cycles/kernel/geom/geom_qbvh_shadow.h @@ -155,11 +155,11 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, ++stackPtr; kernel_assert(stackPtr < BVH_QSTACK_SIZE); traversalStack[stackPtr].addr = c1; - traversalStack[stackPtr].dist = c1; + traversalStack[stackPtr].dist = d1; ++stackPtr; kernel_assert(stackPtr < BVH_QSTACK_SIZE); traversalStack[stackPtr].addr = c0; - traversalStack[stackPtr].dist = c0; + traversalStack[stackPtr].dist = d0; /* Three children are hit, push all onto stack and sort 3 * stack items, continue with closest child. @@ -206,7 +206,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, /* If node is leaf, fetch triangle list. */ if(nodeAddr < 0) { - float4 leaf = kernel_tex_fetch(__bvh_nodes, (-nodeAddr-1)*BVH_QNODE_SIZE+6); + float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-nodeAddr-1)*BVH_QNODE_LEAF_SIZE); #ifdef __VISIBILITY_FLAG__ if((__float_as_uint(leaf.z) & PATH_RAY_SHADOW) == 0) { /* Pop. 
*/ @@ -241,7 +241,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, switch(p_type) { case PRIMITIVE_TRIANGLE: { - hit = triangle_intersect(kg, &isect_precalc, isect_array, P, dir, PATH_RAY_SHADOW, object, primAddr); + hit = triangle_intersect(kg, &isect_precalc, isect_array, P, PATH_RAY_SHADOW, object, primAddr); break; } #if BVH_FEATURE(BVH_MOTION) @@ -279,7 +279,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, if(kernel_tex_fetch(__prim_type, isect_array->prim) & PRIMITIVE_ALL_TRIANGLE) #endif { - shader = kernel_tex_fetch(__tri_shader, prim); + shader = kernel_tex_fetch(__tri_shader, prim); } #ifdef __HAIR__ else { diff --git a/intern/cycles/kernel/geom/geom_qbvh_subsurface.h b/intern/cycles/kernel/geom/geom_qbvh_subsurface.h index 62598115fa3..d85e1a4691e 100644 --- a/intern/cycles/kernel/geom/geom_qbvh_subsurface.h +++ b/intern/cycles/kernel/geom/geom_qbvh_subsurface.h @@ -202,7 +202,7 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, /* If node is leaf, fetch triangle list. 
*/ if(nodeAddr < 0) { - float4 leaf = kernel_tex_fetch(__bvh_nodes, (-nodeAddr-1)*BVH_QNODE_SIZE+6); + float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-nodeAddr-1)*BVH_QNODE_LEAF_SIZE); int primAddr = __float_as_int(leaf.x); #if BVH_FEATURE(BVH_INSTANCING) @@ -226,7 +226,7 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, if(tri_object != subsurface_object) { continue; } - triangle_intersect_subsurface(kg, &isect_precalc, isect_array, P, dir, object, primAddr, isect_t, &num_hits, lcg_state, max_hits); + triangle_intersect_subsurface(kg, &isect_precalc, isect_array, P, object, primAddr, isect_t, &num_hits, lcg_state, max_hits); } break; } diff --git a/intern/cycles/kernel/geom/geom_qbvh_traversal.h b/intern/cycles/kernel/geom/geom_qbvh_traversal.h index 99d2fb20837..7e356ea062b 100644 --- a/intern/cycles/kernel/geom/geom_qbvh_traversal.h +++ b/intern/cycles/kernel/geom/geom_qbvh_traversal.h @@ -80,6 +80,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, #if defined(__KERNEL_DEBUG__) isect->num_traversal_steps = 0; + isect->num_traversed_instances = 0; #endif ssef tnear(0.0f), tfar(ray->t); @@ -185,6 +186,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, if(traverseChild == 0) { if(d1 < d0) { nodeAddr = c1; + nodeDist = d1; ++stackPtr; kernel_assert(stackPtr < BVH_QSTACK_SIZE); traversalStack[stackPtr].addr = c0; @@ -193,6 +195,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, } else { nodeAddr = c0; + nodeDist = d0; ++stackPtr; kernel_assert(stackPtr < BVH_QSTACK_SIZE); traversalStack[stackPtr].addr = c1; @@ -260,7 +263,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, /* If node is leaf, fetch triangle list. 
*/ if(nodeAddr < 0) { - float4 leaf = kernel_tex_fetch(__bvh_nodes, (-nodeAddr-1)*BVH_QNODE_SIZE+6); + float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-nodeAddr-1)*BVH_QNODE_LEAF_SIZE); #ifdef __VISIBILITY_FLAG__ if(UNLIKELY((nodeDist > isect->t) || ((__float_as_uint(leaf.z) & visibility) == 0))) @@ -296,7 +299,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, isect->num_traversal_steps++; #endif kernel_assert(kernel_tex_fetch(__prim_type, primAddr) == type); - if(triangle_intersect(kg, &isect_precalc, isect, P, dir, visibility, object, primAddr)) { + if(triangle_intersect(kg, &isect_precalc, isect, P, visibility, object, primAddr)) { tfar = ssef(isect->t); /* Shadow ray early termination. */ if(visibility == PATH_RAY_SHADOW_OPAQUE) @@ -377,6 +380,10 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, traversalStack[stackPtr].dist = -FLT_MAX; nodeAddr = kernel_tex_fetch(__object_node, object); + +#if defined(__KERNEL_DEBUG__) + isect->num_traversed_instances++; +#endif } } #endif /* FEATURE(BVH_INSTANCING) */ diff --git a/intern/cycles/kernel/geom/geom_qbvh_volume.h b/intern/cycles/kernel/geom/geom_qbvh_volume.h index 2c396e99fc4..d8cfa3a4061 100644 --- a/intern/cycles/kernel/geom/geom_qbvh_volume.h +++ b/intern/cycles/kernel/geom/geom_qbvh_volume.h @@ -95,10 +95,6 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, do { /* Traverse internal nodes. */ while(nodeAddr >= 0 && nodeAddr != ENTRYPOINT_SENTINEL) { -#if defined(__KERNEL_DEBUG__) - isect->num_traversal_steps++; -#endif - ssef dist; int traverseChild = qbvh_node_intersect(kg, tnear, @@ -208,7 +204,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, /* If node is leaf, fetch triangle list. 
*/ if(nodeAddr < 0) { - float4 leaf = kernel_tex_fetch(__bvh_nodes, (-nodeAddr-1)*BVH_QNODE_SIZE+6); + float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-nodeAddr-1)*BVH_QNODE_LEAF_SIZE); int primAddr = __float_as_int(leaf.x); #if BVH_FEATURE(BVH_INSTANCING) @@ -234,7 +230,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, continue; } /* Intersect ray against primitive. */ - triangle_intersect(kg, &isect_precalc, isect, P, dir, visibility, object, primAddr); + triangle_intersect(kg, &isect_precalc, isect, P, visibility, object, primAddr); } break; } diff --git a/intern/cycles/kernel/geom/geom_qbvh_volume_all.h b/intern/cycles/kernel/geom/geom_qbvh_volume_all.h new file mode 100644 index 00000000000..d5131919944 --- /dev/null +++ b/intern/cycles/kernel/geom/geom_qbvh_volume_all.h @@ -0,0 +1,446 @@ +/* + * Adapted from code Copyright 2009-2010 NVIDIA Corporation, + * and code copyright 2009-2012 Intel Corporation + * + * Modifications Copyright 2011-2014, Blender Foundation. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* This is a template BVH traversal function for volumes, where + * various features can be enabled/disabled. This way we can compile optimized + * versions for each case without new features slowing things down. 
+ * + * BVH_INSTANCING: object instancing + * BVH_HAIR: hair curve rendering + * BVH_MOTION: motion blur rendering + * + */ + +ccl_device uint BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, + const Ray *ray, + Intersection *isect_array, + const uint max_hits) +{ + /* TODO(sergey): + * - Test if pushing distance on the stack helps. + * - Likely and unlikely for if() statements. + * - Test restrict attribute for pointers. + */ + + /* Traversal stack in CUDA thread-local memory. */ + QBVHStackItem traversalStack[BVH_QSTACK_SIZE]; + traversalStack[0].addr = ENTRYPOINT_SENTINEL; + + /* Traversal variables in registers. */ + int stackPtr = 0; + int nodeAddr = kernel_data.bvh.root; + + /* Ray parameters in registers. */ + const float tmax = ray->t; + float3 P = ray->P; + float3 dir = bvh_clamp_direction(ray->D); + float3 idir = bvh_inverse_direction(dir); + int object = OBJECT_NONE; + float isect_t = tmax; + + const uint visibility = PATH_RAY_ALL_VISIBILITY; + +#if BVH_FEATURE(BVH_MOTION) + Transform ob_tfm; +#endif + +#ifndef __KERNEL_SSE41__ + if(!isfinite(P.x)) { + return false; + } +#endif + +#if BVH_FEATURE(BVH_INSTANCING) + int num_hits_in_instance = 0; +#endif + + uint num_hits = 0; + isect_array->t = tmax; + + ssef tnear(0.0f), tfar(isect_t); + sse3f idir4(ssef(idir.x), ssef(idir.y), ssef(idir.z)); + +#ifdef __KERNEL_AVX2__ + float3 P_idir = P*idir; + sse3f P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z); +#else + sse3f org = sse3f(ssef(P.x), ssef(P.y), ssef(P.z)); +#endif + + /* Offsets to select the side that becomes the lower or upper bound. */ + int near_x, near_y, near_z; + int far_x, far_y, far_z; + + if(idir.x >= 0.0f) { near_x = 0; far_x = 1; } else { near_x = 1; far_x = 0; } + if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; } + if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; } + + IsectPrecalc isect_precalc; + triangle_intersect_precalc(dir, &isect_precalc); + + /* Traversal loop. 
*/ + do { + do { + /* Traverse internal nodes. */ + while(nodeAddr >= 0 && nodeAddr != ENTRYPOINT_SENTINEL) { + ssef dist; + int traverseChild = qbvh_node_intersect(kg, + tnear, + tfar, +#ifdef __KERNEL_AVX2__ + P_idir4, +#else + org, +#endif + idir4, + near_x, near_y, near_z, + far_x, far_y, far_z, + nodeAddr, + &dist); + + if(traverseChild != 0) { + float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_QNODE_SIZE+6); + + /* One child is hit, continue with that child. */ + int r = __bscf(traverseChild); + if(traverseChild == 0) { + nodeAddr = __float_as_int(cnodes[r]); + continue; + } + + /* Two children are hit, push far child, and continue with + * closer child. + */ + int c0 = __float_as_int(cnodes[r]); + float d0 = ((float*)&dist)[r]; + r = __bscf(traverseChild); + int c1 = __float_as_int(cnodes[r]); + float d1 = ((float*)&dist)[r]; + if(traverseChild == 0) { + if(d1 < d0) { + nodeAddr = c1; + ++stackPtr; + kernel_assert(stackPtr < BVH_QSTACK_SIZE); + traversalStack[stackPtr].addr = c0; + traversalStack[stackPtr].dist = d0; + continue; + } + else { + nodeAddr = c0; + ++stackPtr; + kernel_assert(stackPtr < BVH_QSTACK_SIZE); + traversalStack[stackPtr].addr = c1; + traversalStack[stackPtr].dist = d1; + continue; + } + } + + /* Here starts the slow path for 3 or 4 hit children. We push + * all nodes onto the stack to sort them there. + */ + ++stackPtr; + kernel_assert(stackPtr < BVH_QSTACK_SIZE); + traversalStack[stackPtr].addr = c1; + traversalStack[stackPtr].dist = d1; + ++stackPtr; + kernel_assert(stackPtr < BVH_QSTACK_SIZE); + traversalStack[stackPtr].addr = c0; + traversalStack[stackPtr].dist = d0; + + /* Three children are hit, push all onto stack and sort 3 + * stack items, continue with closest child. 
+ */ + r = __bscf(traverseChild); + int c2 = __float_as_int(cnodes[r]); + float d2 = ((float*)&dist)[r]; + if(traverseChild == 0) { + ++stackPtr; + kernel_assert(stackPtr < BVH_QSTACK_SIZE); + traversalStack[stackPtr].addr = c2; + traversalStack[stackPtr].dist = d2; + qbvh_stack_sort(&traversalStack[stackPtr], + &traversalStack[stackPtr - 1], + &traversalStack[stackPtr - 2]); + nodeAddr = traversalStack[stackPtr].addr; + --stackPtr; + continue; + } + + /* Four children are hit, push all onto stack and sort 4 + * stack items, continue with closest child. + */ + r = __bscf(traverseChild); + int c3 = __float_as_int(cnodes[r]); + float d3 = ((float*)&dist)[r]; + ++stackPtr; + kernel_assert(stackPtr < BVH_QSTACK_SIZE); + traversalStack[stackPtr].addr = c3; + traversalStack[stackPtr].dist = d3; + ++stackPtr; + kernel_assert(stackPtr < BVH_QSTACK_SIZE); + traversalStack[stackPtr].addr = c2; + traversalStack[stackPtr].dist = d2; + qbvh_stack_sort(&traversalStack[stackPtr], + &traversalStack[stackPtr - 1], + &traversalStack[stackPtr - 2], + &traversalStack[stackPtr - 3]); + } + + nodeAddr = traversalStack[stackPtr].addr; + --stackPtr; + } + + /* If node is leaf, fetch triangle list. */ + if(nodeAddr < 0) { + float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-nodeAddr-1)*BVH_QNODE_LEAF_SIZE); + int primAddr = __float_as_int(leaf.x); + +#if BVH_FEATURE(BVH_INSTANCING) + if(primAddr >= 0) { +#endif + int primAddr2 = __float_as_int(leaf.y); + const uint type = __float_as_int(leaf.w); + const uint p_type = type & PRIMITIVE_ALL; + bool hit; + + /* Pop. */ + nodeAddr = traversalStack[stackPtr].addr; + --stackPtr; + + /* Primitive intersection. */ + switch(p_type) { + case PRIMITIVE_TRIANGLE: { + for(; primAddr < primAddr2; primAddr++) { + kernel_assert(kernel_tex_fetch(__prim_type, primAddr) == type); + /* Only primitives from volume object. */ + uint tri_object = (object == OBJECT_NONE)? 
kernel_tex_fetch(__prim_object, primAddr): object; + int object_flag = kernel_tex_fetch(__object_flag, tri_object); + if((object_flag & SD_OBJECT_HAS_VOLUME) == 0) { + continue; + } + /* Intersect ray against primitive. */ + hit = triangle_intersect(kg, &isect_precalc, isect_array, P, visibility, object, primAddr); + if(hit) { + /* Move on to next entry in intersections array. */ + isect_array++; + num_hits++; +#if BVH_FEATURE(BVH_INSTANCING) + num_hits_in_instance++; +#endif + isect_array->t = isect_t; + if(num_hits == max_hits) { +#if BVH_FEATURE(BVH_INSTANCING) +#if BVH_FEATURE(BVH_MOTION) + float t_fac = len(transform_direction(&ob_tfm, 1.0f/idir)); +#else + Transform tfm = object_fetch_transform(kg, object, OBJECT_TRANSFORM); + float t_fac = len(transform_direction(&tfm, 1.0f/idir)); +#endif + for(int i = 0; i < num_hits_in_instance; i++) { + (isect_array-i-1)->t *= t_fac; + } +#endif /* BVH_FEATURE(BVH_INSTANCING) */ + return num_hits; + } + } + } + break; + } +#if BVH_FEATURE(BVH_MOTION) + case PRIMITIVE_MOTION_TRIANGLE: { + for(; primAddr < primAddr2; primAddr++) { + kernel_assert(kernel_tex_fetch(__prim_type, primAddr) == type); + /* Only primitives from volume object. */ + uint tri_object = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, primAddr): object; + int object_flag = kernel_tex_fetch(__object_flag, tri_object); + if((object_flag & SD_OBJECT_HAS_VOLUME) == 0) { + continue; + } + /* Intersect ray against primitive. */ + hit = motion_triangle_intersect(kg, isect_array, P, dir, ray->time, visibility, object, primAddr); + if(hit) { + /* Move on to next entry in intersections array. 
*/ + isect_array++; + num_hits++; +#if BVH_FEATURE(BVH_INSTANCING) + num_hits_in_instance++; +#endif + isect_array->t = isect_t; + if(num_hits == max_hits) { +#if BVH_FEATURE(BVH_INSTANCING) +# if BVH_FEATURE(BVH_MOTION) + float t_fac = len(transform_direction(&ob_tfm, 1.0f/idir)); +# else + Transform tfm = object_fetch_transform(kg, object, OBJECT_TRANSFORM); + float t_fac = len(transform_direction(&tfm, 1.0f/idir)); +#endif + for(int i = 0; i < num_hits_in_instance; i++) { + (isect_array-i-1)->t *= t_fac; + } +#endif /* BVH_FEATURE(BVH_INSTANCING) */ + return num_hits; + } + } + } + break; + } +#endif +#if BVH_FEATURE(BVH_HAIR) + case PRIMITIVE_CURVE: + case PRIMITIVE_MOTION_CURVE: { + for(; primAddr < primAddr2; primAddr++) { + kernel_assert(kernel_tex_fetch(__prim_type, primAddr) == type); + /* Only primitives from volume object. */ + uint tri_object = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, primAddr): object; + int object_flag = kernel_tex_fetch(__object_flag, tri_object); + if((object_flag & SD_OBJECT_HAS_VOLUME) == 0) { + continue; + } + /* Intersect ray against primitive. */ + if(kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE) + hit = bvh_cardinal_curve_intersect(kg, isect_array, P, dir, visibility, object, primAddr, ray->time, type, NULL, 0, 0); + else + hit = bvh_curve_intersect(kg, isect_array, P, dir, visibility, object, primAddr, ray->time, type, NULL, 0, 0); + if(hit) { + /* Move on to next entry in intersections array. 
*/ + isect_array++; + num_hits++; +#if BVH_FEATURE(BVH_INSTANCING) + num_hits_in_instance++; +#endif + isect_array->t = isect_t; + if(num_hits == max_hits) { +#if BVH_FEATURE(BVH_INSTANCING) +# if BVH_FEATURE(BVH_MOTION) + float t_fac = len(transform_direction(&ob_tfm, 1.0f/idir)); +# else + Transform tfm = object_fetch_transform(kg, object, OBJECT_TRANSFORM); + float t_fac = len(transform_direction(&tfm, 1.0f/idir)); +#endif + for(int i = 0; i < num_hits_in_instance; i++) { + (isect_array-i-1)->t *= t_fac; + } +#endif /* BVH_FEATURE(BVH_INSTANCING) */ + return num_hits; + } + } + } + break; + } +#endif + } + } +#if BVH_FEATURE(BVH_INSTANCING) + else { + /* Instance push. */ + object = kernel_tex_fetch(__prim_object, -primAddr-1); + int object_flag = kernel_tex_fetch(__object_flag, object); + + if(object_flag & SD_OBJECT_HAS_VOLUME) { + +#if BVH_FEATURE(BVH_MOTION) + bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &isect_t, &ob_tfm); +#else + bvh_instance_push(kg, object, ray, &P, &dir, &idir, &isect_t); +#endif + + if(idir.x >= 0.0f) { near_x = 0; far_x = 1; } else { near_x = 1; far_x = 0; } + if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; } + if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; } + tfar = ssef(isect_t); + idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z)); +#ifdef __KERNEL_AVX2__ + P_idir = P*idir; + P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z); +#else + org = sse3f(ssef(P.x), ssef(P.y), ssef(P.z)); +#endif + triangle_intersect_precalc(dir, &isect_precalc); + num_hits_in_instance = 0; + isect_array->t = isect_t; + + ++stackPtr; + kernel_assert(stackPtr < BVH_QSTACK_SIZE); + traversalStack[stackPtr].addr = ENTRYPOINT_SENTINEL; + + nodeAddr = kernel_tex_fetch(__object_node, object); + } + else { + /* Pop. 
*/ + object = OBJECT_NONE; + nodeAddr = traversalStack[stackPtr].addr; + --stackPtr; + } + } + } +#endif /* FEATURE(BVH_INSTANCING) */ + } while(nodeAddr != ENTRYPOINT_SENTINEL); + +#if BVH_FEATURE(BVH_INSTANCING) + if(stackPtr >= 0) { + kernel_assert(object != OBJECT_NONE); + + /* Instance pop. */ + if(num_hits_in_instance) { + float t_fac; +#if BVH_FEATURE(BVH_MOTION) + bvh_instance_motion_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac, &ob_tfm); +#else + bvh_instance_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac); +#endif + triangle_intersect_precalc(dir, &isect_precalc); + /* Scale isect->t to adjust for instancing. */ + for(int i = 0; i < num_hits_in_instance; i++) { + (isect_array-i-1)->t *= t_fac; + } + } + else { + float ignore_t = FLT_MAX; +#if BVH_FEATURE(BVH_MOTION) + bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, &ignore_t, &ob_tfm); +#else + bvh_instance_pop(kg, object, ray, &P, &dir, &idir, &ignore_t); +#endif + triangle_intersect_precalc(dir, &isect_precalc); + } + + if(idir.x >= 0.0f) { near_x = 0; far_x = 1; } else { near_x = 1; far_x = 0; } + if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; } + if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; } + tfar = ssef(isect_t); + idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z)); +#ifdef __KERNEL_AVX2__ + P_idir = P*idir; + P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z); +#else + org = sse3f(ssef(P.x), ssef(P.y), ssef(P.z)); +#endif + triangle_intersect_precalc(dir, &isect_precalc); + isect_t = tmax; + isect_array->t = isect_t; + + object = OBJECT_NONE; + nodeAddr = traversalStack[stackPtr].addr; + --stackPtr; + } +#endif /* FEATURE(BVH_INSTANCING) */ + } while(nodeAddr != ENTRYPOINT_SENTINEL); + + return num_hits; +} diff --git a/intern/cycles/kernel/geom/geom_triangle.h b/intern/cycles/kernel/geom/geom_triangle.h index dd3928682e3..995dfac5b09 100644 --- a/intern/cycles/kernel/geom/geom_triangle.h +++ 
b/intern/cycles/kernel/geom/geom_triangle.h @@ -27,14 +27,14 @@ CCL_NAMESPACE_BEGIN ccl_device_inline float3 triangle_normal(KernelGlobals *kg, ShaderData *sd) { /* load triangle vertices */ - float4 tri_vindex = kernel_tex_fetch(__tri_vindex, sd->prim); + float4 tri_vindex = kernel_tex_fetch(__tri_vindex, ccl_fetch(sd, prim)); float3 v0 = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.x))); float3 v1 = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.y))); float3 v2 = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.z))); /* return normal */ - if(sd->flag & SD_NEGATIVE_SCALE_APPLIED) + if(ccl_fetch(sd, flag) & SD_NEGATIVE_SCALE_APPLIED) return normalize(cross(v2 - v0, v1 - v0)); else return normalize(cross(v1 - v0, v2 - v0)); @@ -94,7 +94,7 @@ ccl_device_inline float3 triangle_smooth_normal(KernelGlobals *kg, int prim, flo /* Ray differentials on triangle */ -ccl_device_inline void triangle_dPdudv(KernelGlobals *kg, int prim, float3 *dPdu, float3 *dPdv) +ccl_device_inline void triangle_dPdudv(KernelGlobals *kg, int prim, ccl_addr_space float3 *dPdu, ccl_addr_space float3 *dPdv) { /* fetch triangle vertex coordinates */ float4 tri_vindex = kernel_tex_fetch(__tri_vindex, prim); @@ -116,34 +116,34 @@ ccl_device float triangle_attribute_float(KernelGlobals *kg, const ShaderData *s if(dx) *dx = 0.0f; if(dy) *dy = 0.0f; - return kernel_tex_fetch(__attributes_float, offset + sd->prim); + return kernel_tex_fetch(__attributes_float, offset + ccl_fetch(sd, prim)); } else if(elem == ATTR_ELEMENT_VERTEX || elem == ATTR_ELEMENT_VERTEX_MOTION) { - float4 tri_vindex = kernel_tex_fetch(__tri_vindex, sd->prim); + float4 tri_vindex = kernel_tex_fetch(__tri_vindex, ccl_fetch(sd, prim)); float f0 = kernel_tex_fetch(__attributes_float, offset + __float_as_int(tri_vindex.x)); float f1 = kernel_tex_fetch(__attributes_float, offset + __float_as_int(tri_vindex.y)); float f2 = kernel_tex_fetch(__attributes_float, 
offset + __float_as_int(tri_vindex.z)); #ifdef __RAY_DIFFERENTIALS__ - if(dx) *dx = sd->du.dx*f0 + sd->dv.dx*f1 - (sd->du.dx + sd->dv.dx)*f2; - if(dy) *dy = sd->du.dy*f0 + sd->dv.dy*f1 - (sd->du.dy + sd->dv.dy)*f2; + if(dx) *dx = ccl_fetch(sd, du).dx*f0 + ccl_fetch(sd, dv).dx*f1 - (ccl_fetch(sd, du).dx + ccl_fetch(sd, dv).dx)*f2; + if(dy) *dy = ccl_fetch(sd, du).dy*f0 + ccl_fetch(sd, dv).dy*f1 - (ccl_fetch(sd, du).dy + ccl_fetch(sd, dv).dy)*f2; #endif - return sd->u*f0 + sd->v*f1 + (1.0f - sd->u - sd->v)*f2; + return ccl_fetch(sd, u)*f0 + ccl_fetch(sd, v)*f1 + (1.0f - ccl_fetch(sd, u) - ccl_fetch(sd, v))*f2; } else if(elem == ATTR_ELEMENT_CORNER) { - int tri = offset + sd->prim*3; + int tri = offset + ccl_fetch(sd, prim)*3; float f0 = kernel_tex_fetch(__attributes_float, tri + 0); float f1 = kernel_tex_fetch(__attributes_float, tri + 1); float f2 = kernel_tex_fetch(__attributes_float, tri + 2); #ifdef __RAY_DIFFERENTIALS__ - if(dx) *dx = sd->du.dx*f0 + sd->dv.dx*f1 - (sd->du.dx + sd->dv.dx)*f2; - if(dy) *dy = sd->du.dy*f0 + sd->dv.dy*f1 - (sd->du.dy + sd->dv.dy)*f2; + if(dx) *dx = ccl_fetch(sd, du).dx*f0 + ccl_fetch(sd, dv).dx*f1 - (ccl_fetch(sd, du).dx + ccl_fetch(sd, dv).dx)*f2; + if(dy) *dy = ccl_fetch(sd, du).dy*f0 + ccl_fetch(sd, dv).dy*f1 - (ccl_fetch(sd, du).dy + ccl_fetch(sd, dv).dy)*f2; #endif - return sd->u*f0 + sd->v*f1 + (1.0f - sd->u - sd->v)*f2; + return ccl_fetch(sd, u)*f0 + ccl_fetch(sd, v)*f1 + (1.0f - ccl_fetch(sd, u) - ccl_fetch(sd, v))*f2; } else { if(dx) *dx = 0.0f; @@ -159,24 +159,24 @@ ccl_device float3 triangle_attribute_float3(KernelGlobals *kg, const ShaderData if(dx) *dx = make_float3(0.0f, 0.0f, 0.0f); if(dy) *dy = make_float3(0.0f, 0.0f, 0.0f); - return float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + sd->prim)); + return float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + ccl_fetch(sd, prim))); } else if(elem == ATTR_ELEMENT_VERTEX || elem == ATTR_ELEMENT_VERTEX_MOTION) { - float4 tri_vindex = 
kernel_tex_fetch(__tri_vindex, sd->prim); + float4 tri_vindex = kernel_tex_fetch(__tri_vindex, ccl_fetch(sd, prim)); float3 f0 = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + __float_as_int(tri_vindex.x))); float3 f1 = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + __float_as_int(tri_vindex.y))); float3 f2 = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + __float_as_int(tri_vindex.z))); #ifdef __RAY_DIFFERENTIALS__ - if(dx) *dx = sd->du.dx*f0 + sd->dv.dx*f1 - (sd->du.dx + sd->dv.dx)*f2; - if(dy) *dy = sd->du.dy*f0 + sd->dv.dy*f1 - (sd->du.dy + sd->dv.dy)*f2; + if(dx) *dx = ccl_fetch(sd, du).dx*f0 + ccl_fetch(sd, dv).dx*f1 - (ccl_fetch(sd, du).dx + ccl_fetch(sd, dv).dx)*f2; + if(dy) *dy = ccl_fetch(sd, du).dy*f0 + ccl_fetch(sd, dv).dy*f1 - (ccl_fetch(sd, du).dy + ccl_fetch(sd, dv).dy)*f2; #endif - return sd->u*f0 + sd->v*f1 + (1.0f - sd->u - sd->v)*f2; + return ccl_fetch(sd, u)*f0 + ccl_fetch(sd, v)*f1 + (1.0f - ccl_fetch(sd, u) - ccl_fetch(sd, v))*f2; } else if(elem == ATTR_ELEMENT_CORNER || elem == ATTR_ELEMENT_CORNER_BYTE) { - int tri = offset + sd->prim*3; + int tri = offset + ccl_fetch(sd, prim)*3; float3 f0, f1, f2; if(elem == ATTR_ELEMENT_CORNER) { @@ -191,11 +191,11 @@ ccl_device float3 triangle_attribute_float3(KernelGlobals *kg, const ShaderData } #ifdef __RAY_DIFFERENTIALS__ - if(dx) *dx = sd->du.dx*f0 + sd->dv.dx*f1 - (sd->du.dx + sd->dv.dx)*f2; - if(dy) *dy = sd->du.dy*f0 + sd->dv.dy*f1 - (sd->du.dy + sd->dv.dy)*f2; + if(dx) *dx = ccl_fetch(sd, du).dx*f0 + ccl_fetch(sd, dv).dx*f1 - (ccl_fetch(sd, du).dx + ccl_fetch(sd, dv).dx)*f2; + if(dy) *dy = ccl_fetch(sd, du).dy*f0 + ccl_fetch(sd, dv).dy*f1 - (ccl_fetch(sd, du).dy + ccl_fetch(sd, dv).dy)*f2; #endif - return sd->u*f0 + sd->v*f1 + (1.0f - sd->u - sd->v)*f2; + return ccl_fetch(sd, u)*f0 + ccl_fetch(sd, v)*f1 + (1.0f - ccl_fetch(sd, u) - ccl_fetch(sd, v))*f2; } else { if(dx) *dx = make_float3(0.0f, 0.0f, 0.0f); diff --git 
a/intern/cycles/kernel/geom/geom_triangle_intersect.h b/intern/cycles/kernel/geom/geom_triangle_intersect.h index c9e30a451da..3ef918dc842 100644 --- a/intern/cycles/kernel/geom/geom_triangle_intersect.h +++ b/intern/cycles/kernel/geom/geom_triangle_intersect.h @@ -14,7 +14,7 @@ * limitations under the License. */ -/* Triangle/Ray intersections . +/* Triangle/Ray intersections. * * For BVH ray intersection we use a precomputed triangle storage to accelerate * intersection at the cost of more memory usage. @@ -49,18 +49,27 @@ typedef struct IsectPrecalc { float Sx, Sy, Sz; } IsectPrecalc; -/* Workaround for CUDA toolkit 6.5.16. */ -#if defined(__KERNEL_CPU__) || !defined(__KERNEL_CUDA_EXPERIMENTAL__) || __CUDA_ARCH__ < 500 +#if defined(__KERNEL_CUDA__) # if (defined(i386) || defined(_M_IX86)) +# if __CUDA_ARCH__ > 500 ccl_device_noinline -# else +# else /* __CUDA_ARCH__ > 500 */ ccl_device_inline -# endif -#else +# endif /* __CUDA_ARCH__ > 500 */ +# else /* (defined(i386) || defined(_M_IX86)) */ +# if defined(__KERNEL_EXPERIMENTAL__) && (__CUDA_ARCH__ >= 500) ccl_device_noinline -#endif +# else +ccl_device_inline +# endif +# endif /* (defined(i386) || defined(_M_IX86)) */ +#elif defined(__KERNEL_OPENCL_APPLE__) +ccl_device_noinline +#else /* defined(__KERNEL_OPENCL_APPLE__) */ +ccl_device_inline +#endif /* defined(__KERNEL_OPENCL_APPLE__) */ void triangle_intersect_precalc(float3 dir, - IsectPrecalc *isect_precalc) + IsectPrecalc *isect_precalc) { /* Calculate dimension where the ray direction is maximal. */ int kz = util_max_axis(make_float3(fabsf(dir.x), @@ -77,10 +86,10 @@ void triangle_intersect_precalc(float3 dir, } /* Calculate the shear constants. 
*/ - float inf_dir_z = 1.0f / IDX(dir, kz); - isect_precalc->Sx = IDX(dir, kx) * inf_dir_z; - isect_precalc->Sy = IDX(dir, ky) * inf_dir_z; - isect_precalc->Sz = inf_dir_z; + float inv_dir_z = 1.0f / IDX(dir, kz); + isect_precalc->Sx = IDX(dir, kx) * inv_dir_z; + isect_precalc->Sy = IDX(dir, ky) * inv_dir_z; + isect_precalc->Sz = inv_dir_z; /* Store the dimensions. */ isect_precalc->kx = kx; @@ -98,7 +107,6 @@ ccl_device_inline bool triangle_intersect(KernelGlobals *kg, const IsectPrecalc *isect_precalc, Intersection *isect, float3 P, - float3 dir, uint visibility, int object, int triAddr) @@ -111,14 +119,12 @@ ccl_device_inline bool triangle_intersect(KernelGlobals *kg, const float Sz = isect_precalc->Sz; /* Calculate vertices relative to ray origin. */ - float3 tri[3]; - tri[0] = float4_to_float3(kernel_tex_fetch(__tri_woop, triAddr*TRI_NODE_SIZE+0)); - tri[1] = float4_to_float3(kernel_tex_fetch(__tri_woop, triAddr*TRI_NODE_SIZE+1)); - tri[2] = float4_to_float3(kernel_tex_fetch(__tri_woop, triAddr*TRI_NODE_SIZE+2)); - - const float3 A = tri[0] - P; - const float3 B = tri[1] - P; - const float3 C = tri[2] - P; + const float4 tri_a = kernel_tex_fetch(__tri_woop, triAddr*TRI_NODE_SIZE+0), + tri_b = kernel_tex_fetch(__tri_woop, triAddr*TRI_NODE_SIZE+1), + tri_c = kernel_tex_fetch(__tri_woop, triAddr*TRI_NODE_SIZE+2); + const float3 A = make_float3(tri_a.x - P.x, tri_a.y - P.y, tri_a.z - P.z); + const float3 B = make_float3(tri_b.x - P.x, tri_b.y - P.y, tri_b.z - P.z); + const float3 C = make_float3(tri_c.x - P.x, tri_c.y - P.y, tri_c.z - P.z); const float A_kx = IDX(A, kx), A_ky = IDX(A, ky), A_kz = IDX(A, kz); const float B_kx = IDX(B, kx), B_ky = IDX(B, ky), B_kz = IDX(B, kz); @@ -155,8 +161,8 @@ ccl_device_inline bool triangle_intersect(KernelGlobals *kg, */ const float T = (U * A_kz + V * B_kz + W * C_kz) * Sz; const float sign_T = xor_signmast(T, sign_mask); - if ((sign_T < 0.0f) || - (sign_T > isect->t * xor_signmast(det, sign_mask))) + if((sign_T < 0.0f) || + 
(sign_T > isect->t * xor_signmast(det, sign_mask))) { return false; } @@ -191,7 +197,6 @@ ccl_device_inline void triangle_intersect_subsurface( const IsectPrecalc *isect_precalc, Intersection *isect_array, float3 P, - float3 dir, int object, int triAddr, float tmax, @@ -207,14 +212,12 @@ ccl_device_inline void triangle_intersect_subsurface( const float Sz = isect_precalc->Sz; /* Calculate vertices relative to ray origin. */ - float3 tri[3]; - tri[0] = float4_to_float3(kernel_tex_fetch(__tri_woop, triAddr*TRI_NODE_SIZE+0)); - tri[1] = float4_to_float3(kernel_tex_fetch(__tri_woop, triAddr*TRI_NODE_SIZE+1)); - tri[2] = float4_to_float3(kernel_tex_fetch(__tri_woop, triAddr*TRI_NODE_SIZE+2)); - - const float3 A = tri[0] - P; - const float3 B = tri[1] - P; - const float3 C = tri[2] - P; + const float4 tri_a = kernel_tex_fetch(__tri_woop, triAddr*TRI_NODE_SIZE+0), + tri_b = kernel_tex_fetch(__tri_woop, triAddr*TRI_NODE_SIZE+1), + tri_c = kernel_tex_fetch(__tri_woop, triAddr*TRI_NODE_SIZE+2); + const float3 A = make_float3(tri_a.x - P.x, tri_a.y - P.y, tri_a.z - P.z); + const float3 B = make_float3(tri_b.x - P.x, tri_b.y - P.y, tri_b.z - P.z); + const float3 C = make_float3(tri_c.x - P.x, tri_c.y - P.y, tri_c.z - P.z); const float A_kx = IDX(A, kx), A_ky = IDX(A, ky), A_kz = IDX(A, kz); const float B_kx = IDX(B, kx), B_ky = IDX(B, ky), B_kz = IDX(B, kz); @@ -249,13 +252,10 @@ ccl_device_inline void triangle_intersect_subsurface( /* Calculate scaled z−coordinates of vertices and use them to calculate * the hit distance. 
*/ - const float Az = Sz * A_kz; - const float Bz = Sz * B_kz; - const float Cz = Sz * C_kz; - const float T = U * Az + V * Bz + W * Cz; - - if ((xor_signmast(T, sign_mask) < 0.0f) || - (xor_signmast(T, sign_mask) > tmax * xor_signmast(det, sign_mask))) + const float T = (U * A_kz + V * B_kz + W * C_kz) * Sz; + const float sign_T = xor_signmast(T, sign_mask); + if((sign_T < 0.0f) || + (sign_T > tmax * xor_signmast(det, sign_mask))) { return; } @@ -315,7 +315,7 @@ ccl_device_inline float3 triangle_refine(KernelGlobals *kg, return P; } #ifdef __OBJECT_MOTION__ - Transform tfm = sd->ob_itfm; + Transform tfm = ccl_fetch(sd, ob_itfm); #else Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_INVERSE_TRANSFORM); #endif @@ -327,14 +327,12 @@ ccl_device_inline float3 triangle_refine(KernelGlobals *kg, P = P + D*t; - float3 tri[3]; - tri[0] = float4_to_float3(kernel_tex_fetch(__tri_woop, isect->prim*TRI_NODE_SIZE+0)); - tri[1] = float4_to_float3(kernel_tex_fetch(__tri_woop, isect->prim*TRI_NODE_SIZE+1)); - tri[2] = float4_to_float3(kernel_tex_fetch(__tri_woop, isect->prim*TRI_NODE_SIZE+2)); - - float3 edge1 = tri[0] - tri[2]; - float3 edge2 = tri[1] - tri[2]; - float3 tvec = P - tri[2]; + const float4 tri_a = kernel_tex_fetch(__tri_woop, isect->prim*TRI_NODE_SIZE+0), + tri_b = kernel_tex_fetch(__tri_woop, isect->prim*TRI_NODE_SIZE+1), + tri_c = kernel_tex_fetch(__tri_woop, isect->prim*TRI_NODE_SIZE+2); + float3 edge1 = make_float3(tri_a.x - tri_c.x, tri_a.y - tri_c.y, tri_a.z - tri_c.z); + float3 edge2 = make_float3(tri_b.x - tri_c.x, tri_b.y - tri_c.y, tri_b.z - tri_c.z); + float3 tvec = make_float3(P.x - tri_c.x, P.y - tri_c.y, P.z - tri_c.z); float3 qvec = cross(tvec, edge1); float3 pvec = cross(D, edge2); float rt = dot(edge2, qvec) / dot(edge1, pvec); @@ -343,7 +341,7 @@ ccl_device_inline float3 triangle_refine(KernelGlobals *kg, if(isect->object != OBJECT_NONE) { #ifdef __OBJECT_MOTION__ - Transform tfm = sd->ob_tfm; + Transform tfm = ccl_fetch(sd, 
ob_tfm); #else Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_TRANSFORM); #endif @@ -372,7 +370,7 @@ ccl_device_inline float3 triangle_refine_subsurface(KernelGlobals *kg, #ifdef __INTERSECTION_REFINE__ if(isect->object != OBJECT_NONE) { #ifdef __OBJECT_MOTION__ - Transform tfm = sd->ob_itfm; + Transform tfm = ccl_fetch(sd, ob_itfm); #else Transform tfm = object_fetch_transform(kg, isect->object, @@ -386,14 +384,12 @@ ccl_device_inline float3 triangle_refine_subsurface(KernelGlobals *kg, P = P + D*t; - float3 tri[3]; - tri[0] = float4_to_float3(kernel_tex_fetch(__tri_woop, isect->prim*TRI_NODE_SIZE+0)); - tri[1] = float4_to_float3(kernel_tex_fetch(__tri_woop, isect->prim*TRI_NODE_SIZE+1)); - tri[2] = float4_to_float3(kernel_tex_fetch(__tri_woop, isect->prim*TRI_NODE_SIZE+2)); - - float3 edge1 = tri[0] - tri[2]; - float3 edge2 = tri[1] - tri[2]; - float3 tvec = P - tri[2]; + const float4 tri_a = kernel_tex_fetch(__tri_woop, isect->prim*TRI_NODE_SIZE+0), + tri_b = kernel_tex_fetch(__tri_woop, isect->prim*TRI_NODE_SIZE+1), + tri_c = kernel_tex_fetch(__tri_woop, isect->prim*TRI_NODE_SIZE+2); + float3 edge1 = make_float3(tri_a.x - tri_c.x, tri_a.y - tri_c.y, tri_a.z - tri_c.z); + float3 edge2 = make_float3(tri_b.x - tri_c.x, tri_b.y - tri_c.y, tri_b.z - tri_c.z); + float3 tvec = make_float3(P.x - tri_c.x, P.y - tri_c.y, P.z - tri_c.z); float3 qvec = cross(tvec, edge1); float3 pvec = cross(D, edge2); float rt = dot(edge2, qvec) / dot(edge1, pvec); @@ -402,7 +398,7 @@ ccl_device_inline float3 triangle_refine_subsurface(KernelGlobals *kg, if(isect->object != OBJECT_NONE) { #ifdef __OBJECT_MOTION__ - Transform tfm = sd->ob_tfm; + Transform tfm = ccl_fetch(sd, ob_tfm); #else Transform tfm = object_fetch_transform(kg, isect->object, diff --git a/intern/cycles/kernel/geom/geom_volume.h b/intern/cycles/kernel/geom/geom_volume.h index c33509fbf4f..c72afa2a3a4 100644 --- a/intern/cycles/kernel/geom/geom_volume.h +++ b/intern/cycles/kernel/geom/geom_volume.h @@ 
-60,7 +60,7 @@ ccl_device float volume_attribute_float(KernelGlobals *kg, const ShaderData *sd, #endif if(dx) *dx = 0.0f; - if(dx) *dy = 0.0f; + if(dy) *dy = 0.0f; /* todo: support float textures to lower memory usage for single floats */ return average(float4_to_float3(r)); diff --git a/intern/cycles/kernel/kernel_accumulate.h b/intern/cycles/kernel/kernel_accumulate.h index 369c615eade..2dc87fffcbc 100644 --- a/intern/cycles/kernel/kernel_accumulate.h +++ b/intern/cycles/kernel/kernel_accumulate.h @@ -176,7 +176,7 @@ ccl_device_inline void path_radiance_init(PathRadiance *L, int use_light_pass) #endif } -ccl_device_inline void path_radiance_bsdf_bounce(PathRadiance *L, float3 *throughput, +ccl_device_inline void path_radiance_bsdf_bounce(PathRadiance *L, ccl_addr_space float3 *throughput, BsdfEval *bsdf_eval, float bsdf_pdf, int bounce, int bsdf_label) { float inverse_pdf = 1.0f/bsdf_pdf; @@ -341,12 +341,12 @@ ccl_device_inline void path_radiance_reset_indirect(PathRadiance *L) ccl_device_inline float3 path_radiance_clamp_and_sum(KernelGlobals *kg, PathRadiance *L) { - float3 L_sum, L_direct, L_indirect; - float clamp_direct = kernel_data.integrator.sample_clamp_direct; - float clamp_indirect = kernel_data.integrator.sample_clamp_indirect; - + float3 L_sum; /* Light Passes are used */ #ifdef __PASSES__ + float3 L_direct, L_indirect; + float clamp_direct = kernel_data.integrator.sample_clamp_direct; + float clamp_indirect = kernel_data.integrator.sample_clamp_indirect; if(L->use_light_pass) { path_radiance_sum_indirect(L); diff --git a/intern/cycles/kernel/kernel_bake.h b/intern/cycles/kernel/kernel_bake.h index 20d7a143c67..2fca83c615f 100644 --- a/intern/cycles/kernel/kernel_bake.h +++ b/intern/cycles/kernel/kernel_bake.h @@ -57,7 +57,7 @@ ccl_device void compute_light_pass(KernelGlobals *kg, ShaderData *sd, PathRadian /* sample subsurface scattering */ if((is_combined || is_sss_sample) && (sd->flag & SD_BSSRDF)) { /* when mixing BSSRDF and BSDF closures we 
should skip BSDF lighting if scattering was successful */ - if (kernel_path_subsurface_scatter(kg, sd, &L_sample, &state, &rng, &ray, &throughput)) + if(kernel_path_subsurface_scatter(kg, sd, &L_sample, &state, &rng, &ray, &throughput)) is_sss_sample = true; } #endif @@ -208,7 +208,7 @@ ccl_device void kernel_bake_evaluate(KernelGlobals *kg, ccl_global uint4 *input, filter_x = filter_y = 0.5f; } else { - path_rng_2D(kg, &rng, sample, num_samples, PRNG_FILTER_U, &filter_x, &filter_x); + path_rng_2D(kg, &rng, sample, num_samples, PRNG_FILTER_U, &filter_x, &filter_y); } /* subpixel u/v offset */ @@ -259,7 +259,7 @@ ccl_device void kernel_bake_evaluate(KernelGlobals *kg, ccl_global uint4 *input, /* data passes */ case SHADER_EVAL_NORMAL: { - if ((sd.flag & SD_HAS_BUMP)) { + if((sd.flag & SD_HAS_BUMP)) { shader_eval_surface(kg, &sd, 0.f, 0, SHADER_CONTEXT_MAIN); } diff --git a/intern/cycles/kernel/kernel_camera.h b/intern/cycles/kernel/kernel_camera.h index ded222e20ff..3ce5134181a 100644 --- a/intern/cycles/kernel/kernel_camera.h +++ b/intern/cycles/kernel/kernel_camera.h @@ -16,17 +16,6 @@ CCL_NAMESPACE_BEGIN -/* Workaround for explicit conversion from constant to private memory - * pointer when using OpenCL. - * - * TODO(sergey): Find a real solution for this. 
- */ -#ifdef __KERNEL_OPENCL__ -# define __motion_as_decoupled_const_ptr(motion) ((motion)) -#else -# define __motion_as_decoupled_const_ptr(motion) ((const DecompMotionTransform*)(motion)) -#endif - /* Perspective Camera */ ccl_device float2 camera_sample_aperture(KernelGlobals *kg, float u, float v) @@ -50,7 +39,7 @@ ccl_device float2 camera_sample_aperture(KernelGlobals *kg, float u, float v) return bokeh; } -ccl_device void camera_sample_perspective(KernelGlobals *kg, float raster_x, float raster_y, float lens_u, float lens_v, Ray *ray) +ccl_device void camera_sample_perspective(KernelGlobals *kg, float raster_x, float raster_y, float lens_u, float lens_v, ccl_addr_space Ray *ray) { /* create ray form raster position */ Transform rastertocamera = kernel_data.cam.rastertocamera; @@ -80,9 +69,16 @@ ccl_device void camera_sample_perspective(KernelGlobals *kg, float raster_x, flo #ifdef __CAMERA_MOTION__ if(kernel_data.cam.have_motion) { +#ifdef __KERNEL_OPENCL__ + const MotionTransform tfm = kernel_data.cam.motion; transform_motion_interpolate(&cameratoworld, - __motion_as_decoupled_const_ptr(&kernel_data.cam.motion), + ((const DecompMotionTransform*)&tfm), ray->time); +#else + transform_motion_interpolate(&cameratoworld, + ((const DecompMotionTransform*)&kernel_data.cam.motion), + ray->time); +#endif } #endif @@ -112,8 +108,7 @@ ccl_device void camera_sample_perspective(KernelGlobals *kg, float raster_x, flo } /* Orthographic Camera */ - -ccl_device void camera_sample_orthographic(KernelGlobals *kg, float raster_x, float raster_y, float lens_u, float lens_v, Ray *ray) +ccl_device void camera_sample_orthographic(KernelGlobals *kg, float raster_x, float raster_y, float lens_u, float lens_v, ccl_addr_space Ray *ray) { /* create ray form raster position */ Transform rastertocamera = kernel_data.cam.rastertocamera; @@ -144,9 +139,16 @@ ccl_device void camera_sample_orthographic(KernelGlobals *kg, float raster_x, fl #ifdef __CAMERA_MOTION__ 
if(kernel_data.cam.have_motion) { +#ifdef __KERNEL_OPENCL__ + const MotionTransform tfm = kernel_data.cam.motion; transform_motion_interpolate(&cameratoworld, - __motion_as_decoupled_const_ptr(&kernel_data.cam.motion), + (const DecompMotionTransform*)&tfm, ray->time); +#else + transform_motion_interpolate(&cameratoworld, + (const DecompMotionTransform*)&kernel_data.cam.motion, + ray->time); +#endif } #endif @@ -172,7 +174,7 @@ ccl_device void camera_sample_orthographic(KernelGlobals *kg, float raster_x, fl /* Panorama Camera */ -ccl_device void camera_sample_panorama(KernelGlobals *kg, float raster_x, float raster_y, float lens_u, float lens_v, Ray *ray) +ccl_device void camera_sample_panorama(KernelGlobals *kg, float raster_x, float raster_y, float lens_u, float lens_v, ccl_addr_space Ray *ray) { Transform rastertocamera = kernel_data.cam.rastertocamera; float3 Pcamera = transform_perspective(&rastertocamera, make_float3(raster_x, raster_y, 0.0f)); @@ -220,10 +222,18 @@ ccl_device void camera_sample_panorama(KernelGlobals *kg, float raster_x, float Transform cameratoworld = kernel_data.cam.cameratoworld; #ifdef __CAMERA_MOTION__ - if(kernel_data.cam.have_motion) + if(kernel_data.cam.have_motion) { +#ifdef __KERNEL_OPENCL__ + const MotionTransform tfm = kernel_data.cam.motion; transform_motion_interpolate(&cameratoworld, - __motion_as_decoupled_const_ptr(&kernel_data.cam.motion), + (const DecompMotionTransform*)&tfm, ray->time); +#else + transform_motion_interpolate(&cameratoworld, + (const DecompMotionTransform*)&kernel_data.cam.motion, + ray->time); +#endif + } #endif ray->P = transform_point(&cameratoworld, ray->P); @@ -245,7 +255,7 @@ ccl_device void camera_sample_panorama(KernelGlobals *kg, float raster_x, float /* Common */ ccl_device void camera_sample(KernelGlobals *kg, int x, int y, float filter_u, float filter_v, - float lens_u, float lens_v, float time, Ray *ray) + float lens_u, float lens_v, float time, ccl_addr_space Ray *ray) { /* pixel filter */ int 
filter_table_offset = kernel_data.film.filter_table_offset; @@ -308,7 +318,7 @@ ccl_device_inline float3 camera_world_to_ndc(KernelGlobals *kg, ShaderData *sd, { if(kernel_data.cam.type != CAMERA_PANORAMA) { /* perspective / ortho */ - if(sd->object == PRIM_NONE && kernel_data.cam.type == CAMERA_PERSPECTIVE) + if(ccl_fetch(sd, object) == PRIM_NONE && kernel_data.cam.type == CAMERA_PERSPECTIVE) P += camera_position(kg); Transform tfm = kernel_data.cam.worldtondc; @@ -318,7 +328,7 @@ ccl_device_inline float3 camera_world_to_ndc(KernelGlobals *kg, ShaderData *sd, /* panorama */ Transform tfm = kernel_data.cam.worldtocamera; - if(sd->object != OBJECT_NONE) + if(ccl_fetch(sd, object) != OBJECT_NONE) P = normalize(transform_point(&tfm, P)); else P = normalize(transform_direction(&tfm, P)); @@ -329,7 +339,4 @@ ccl_device_inline float3 camera_world_to_ndc(KernelGlobals *kg, ShaderData *sd, } } -#undef __motion_as_decoupled_const_ptr - CCL_NAMESPACE_END - diff --git a/intern/cycles/kernel/kernel_compat_cpu.h b/intern/cycles/kernel/kernel_compat_cpu.h index 200667a0911..0bf1ed36d1e 100644 --- a/intern/cycles/kernel/kernel_compat_cpu.h +++ b/intern/cycles/kernel/kernel_compat_cpu.h @@ -24,6 +24,15 @@ */ #if defined(__GNUC__) && defined(NDEBUG) # pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +# pragma GCC diagnostic ignored "-Wuninitialized" +#endif + +/* Selective nodes compilation. */ +#ifndef __NODES_MAX_GROUP__ +# define __NODES_MAX_GROUP__ NODE_GROUP_LEVEL_MAX +#endif +#ifndef __NODES_FEATURES__ +# define __NODES_FEATURES__ NODE_FEATURE_ALL #endif #include "util_debug.h" @@ -32,6 +41,8 @@ #include "util_half.h" #include "util_types.h" +#define ccl_addr_space + /* On x86_64, versions of glibc < 2.16 have an issue where expf is * much slower than the double version. This was fixed in glibc 2.16. 
*/ diff --git a/intern/cycles/kernel/kernel_compat_cuda.h b/intern/cycles/kernel/kernel_compat_cuda.h index 904736c190c..9fdd3abfec3 100644 --- a/intern/cycles/kernel/kernel_compat_cuda.h +++ b/intern/cycles/kernel/kernel_compat_cuda.h @@ -22,6 +22,14 @@ #define CCL_NAMESPACE_BEGIN #define CCL_NAMESPACE_END +/* Selective nodes compilation. */ +#ifndef __NODES_MAX_GROUP__ +# define __NODES_MAX_GROUP__ NODE_GROUP_LEVEL_MAX +#endif +#ifndef __NODES_FEATURES__ +# define __NODES_FEATURES__ NODE_FEATURE_ALL +#endif + #include <cuda.h> #include <float.h> @@ -33,6 +41,7 @@ #define ccl_global #define ccl_constant #define ccl_may_alias +#define ccl_addr_space /* No assert supported for CUDA */ diff --git a/intern/cycles/kernel/kernel_compat_opencl.h b/intern/cycles/kernel/kernel_compat_opencl.h index d480ec0f270..e8b36d2605d 100644 --- a/intern/cycles/kernel/kernel_compat_opencl.h +++ b/intern/cycles/kernel/kernel_compat_opencl.h @@ -37,6 +37,22 @@ #define ccl_may_alias #define ccl_constant __constant #define ccl_global __global +#define ccl_local __local +#define ccl_private __private + +#ifdef __SPLIT_KERNEL__ +#define ccl_addr_space __global +#else +#define ccl_addr_space +#endif + +/* Selective nodes compilation. 
*/ +#ifndef __NODES_MAX_GROUP__ +# define __NODES_MAX_GROUP__ NODE_GROUP_LEVEL_MAX +#endif +#ifndef __NODES_FEATURES__ +# define __NODES_FEATURES__ NODE_FEATURE_ALL +#endif /* no assert in opencl */ #define kernel_assert(cond) diff --git a/intern/cycles/kernel/kernel_debug.h b/intern/cycles/kernel/kernel_debug.h index f532442ba41..24d6458567e 100644 --- a/intern/cycles/kernel/kernel_debug.h +++ b/intern/cycles/kernel/kernel_debug.h @@ -19,11 +19,13 @@ CCL_NAMESPACE_BEGIN ccl_device_inline void debug_data_init(DebugData *debug_data) { debug_data->num_bvh_traversal_steps = 0; + debug_data->num_bvh_traversed_instances = 0; + debug_data->num_ray_bounces = 0; } ccl_device_inline void kernel_write_debug_passes(KernelGlobals *kg, ccl_global float *buffer, - PathState *state, + ccl_addr_space PathState *state, DebugData *debug_data, int sample) { @@ -33,6 +35,16 @@ ccl_device_inline void kernel_write_debug_passes(KernelGlobals *kg, sample, debug_data->num_bvh_traversal_steps); } + if(flag & PASS_BVH_TRAVERSED_INSTANCES) { + kernel_write_pass_float(buffer + kernel_data.film.pass_bvh_traversed_instances, + sample, + debug_data->num_bvh_traversed_instances); + } + if(flag & PASS_RAY_BOUNCES) { + kernel_write_pass_float(buffer + kernel_data.film.pass_ray_bounces, + sample, + debug_data->num_ray_bounces); + } } CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernel_differential.h b/intern/cycles/kernel/kernel_differential.h index e5fbd5b450e..ae1e70f0167 100644 --- a/intern/cycles/kernel/kernel_differential.h +++ b/intern/cycles/kernel/kernel_differential.h @@ -18,7 +18,7 @@ CCL_NAMESPACE_BEGIN /* See "Tracing Ray Differentials", Homan Igehy, 1999. 
*/ -ccl_device void differential_transfer(differential3 *dP_, const differential3 dP, float3 D, const differential3 dD, float3 Ng, float t) +ccl_device void differential_transfer(ccl_addr_space differential3 *dP_, const differential3 dP, float3 D, const differential3 dD, float3 Ng, float t) { /* ray differential transfer through homogeneous medium, to * compute dPdx/dy at a shading point from the incoming ray */ @@ -31,7 +31,7 @@ ccl_device void differential_transfer(differential3 *dP_, const differential3 dP dP_->dy = tmpy - dot(tmpy, Ng)*tmp; } -ccl_device void differential_incoming(differential3 *dI, const differential3 dD) +ccl_device void differential_incoming(ccl_addr_space differential3 *dI, const differential3 dD) { /* compute dIdx/dy at a shading point, we just need to negate the * differential of the ray direction */ @@ -40,7 +40,7 @@ ccl_device void differential_incoming(differential3 *dI, const differential3 dD) dI->dy = -dD.dy; } -ccl_device void differential_dudv(differential *du, differential *dv, float3 dPdu, float3 dPdv, differential3 dP, float3 Ng) +ccl_device void differential_dudv(ccl_addr_space differential *du, ccl_addr_space differential *dv, float3 dPdu, float3 dPdv, differential3 dP, float3 Ng) { /* now we have dPdx/dy from the ray differential transfer, and dPdu/dv * from the primitive, we can compute dudx/dy and dvdx/dy. 
these are diff --git a/intern/cycles/kernel/kernel_emission.h b/intern/cycles/kernel/kernel_emission.h index 7523105607f..de9e8d77ec8 100644 --- a/intern/cycles/kernel/kernel_emission.h +++ b/intern/cycles/kernel/kernel_emission.h @@ -17,12 +17,20 @@ CCL_NAMESPACE_BEGIN /* Direction Emission */ - ccl_device_noinline float3 direct_emissive_eval(KernelGlobals *kg, - LightSample *ls, float3 I, differential3 dI, float t, float time, int bounce, int transparent_bounce) + LightSample *ls, float3 I, differential3 dI, float t, float time, int bounce, int transparent_bounce +#ifdef __SPLIT_KERNEL__ + ,ShaderData *sd_input +#endif +) { /* setup shading at emitter */ - ShaderData sd; +#ifdef __SPLIT_KERNEL__ + ShaderData *sd = sd_input; +#else + ShaderData sd_object; + ShaderData *sd = &sd_object; +#endif float3 eval; #ifdef __BACKGROUND_MIS__ @@ -37,23 +45,23 @@ ccl_device_noinline float3 direct_emissive_eval(KernelGlobals *kg, ray.dP = differential3_zero(); ray.dD = dI; - shader_setup_from_background(kg, &sd, &ray, bounce+1, transparent_bounce); - eval = shader_eval_background(kg, &sd, 0, SHADER_CONTEXT_EMISSION); + shader_setup_from_background(kg, sd, &ray, bounce+1, transparent_bounce); + eval = shader_eval_background(kg, sd, 0, SHADER_CONTEXT_EMISSION); } else #endif { - shader_setup_from_sample(kg, &sd, ls->P, ls->Ng, I, ls->shader, ls->object, ls->prim, ls->u, ls->v, t, time, bounce+1, transparent_bounce); + shader_setup_from_sample(kg, sd, ls->P, ls->Ng, I, ls->shader, ls->object, ls->prim, ls->u, ls->v, t, time, bounce+1, transparent_bounce); - ls->Ng = sd.Ng; + ls->Ng = ccl_fetch(sd, Ng); /* no path flag, we're evaluating this for all closures. 
that's weak but * we'd have to do multiple evaluations otherwise */ - shader_eval_surface(kg, &sd, 0.0f, 0, SHADER_CONTEXT_EMISSION); + shader_eval_surface(kg, sd, 0.0f, 0, SHADER_CONTEXT_EMISSION); /* evaluate emissive closure */ - if(sd.flag & SD_EMISSION) - eval = shader_emissive_eval(kg, &sd); + if(ccl_fetch(sd, flag) & SD_EMISSION) + eval = shader_emissive_eval(kg, sd); else eval = make_float3(0.0f, 0.0f, 0.0f); } @@ -65,7 +73,11 @@ ccl_device_noinline float3 direct_emissive_eval(KernelGlobals *kg, ccl_device_noinline bool direct_emission(KernelGlobals *kg, ShaderData *sd, LightSample *ls, Ray *ray, BsdfEval *eval, bool *is_lamp, - int bounce, int transparent_bounce) + int bounce, int transparent_bounce +#ifdef __SPLIT_KERNEL__ + , ShaderData *sd_DL +#endif + ) { if(ls->pdf == 0.0f) return false; @@ -74,7 +86,14 @@ ccl_device_noinline bool direct_emission(KernelGlobals *kg, ShaderData *sd, differential3 dD = differential3_zero(); /* evaluate closure */ - float3 light_eval = direct_emissive_eval(kg, ls, -ls->D, dD, ls->t, sd->time, bounce, transparent_bounce); + + float3 light_eval = direct_emissive_eval(kg, ls, -ls->D, dD, ls->t, ccl_fetch(sd, time), + bounce, + transparent_bounce +#ifdef __SPLIT_KERNEL__ + ,sd_DL +#endif + ); if(is_zero(light_eval)) return false; @@ -83,7 +102,7 @@ ccl_device_noinline bool direct_emission(KernelGlobals *kg, ShaderData *sd, float bsdf_pdf; #ifdef __VOLUME__ - if(sd->prim != PRIM_NONE) + if(ccl_fetch(sd, prim) != PRIM_NONE) shader_bsdf_eval(kg, sd, ls->D, eval, &bsdf_pdf); else shader_volume_phase_eval(kg, sd, ls->D, eval, &bsdf_pdf); @@ -118,8 +137,8 @@ ccl_device_noinline bool direct_emission(KernelGlobals *kg, ShaderData *sd, if(ls->shader & SHADER_CAST_SHADOW) { /* setup ray */ - bool transmit = (dot(sd->Ng, ls->D) < 0.0f); - ray->P = ray_offset(sd->P, (transmit)? -sd->Ng: sd->Ng); + bool transmit = (dot(ccl_fetch(sd, Ng), ls->D) < 0.0f); + ray->P = ray_offset(ccl_fetch(sd, P), (transmit)? 
-ccl_fetch(sd, Ng): ccl_fetch(sd, Ng)); if(ls->t == FLT_MAX) { /* distant light */ @@ -132,7 +151,7 @@ ccl_device_noinline bool direct_emission(KernelGlobals *kg, ShaderData *sd, ray->D = normalize_len(ray->D, &ray->t); } - ray->dP = sd->dP; + ray->dP = ccl_fetch(sd, dP); ray->dD = differential3_zero(); } else { @@ -154,14 +173,14 @@ ccl_device_noinline float3 indirect_primitive_emission(KernelGlobals *kg, Shader float3 L = shader_emissive_eval(kg, sd); #ifdef __HAIR__ - if(!(path_flag & PATH_RAY_MIS_SKIP) && (sd->flag & SD_USE_MIS) && (sd->type & PRIMITIVE_ALL_TRIANGLE)) + if(!(path_flag & PATH_RAY_MIS_SKIP) && (ccl_fetch(sd, flag) & SD_USE_MIS) && (ccl_fetch(sd, type) & PRIMITIVE_ALL_TRIANGLE)) #else - if(!(path_flag & PATH_RAY_MIS_SKIP) && (sd->flag & SD_USE_MIS)) + if(!(path_flag & PATH_RAY_MIS_SKIP) && (ccl_fetch(sd, flag) & SD_USE_MIS)) #endif { /* multiple importance sampling, get triangle light pdf, * and compute weight with respect to BSDF pdf */ - float pdf = triangle_light_pdf(kg, sd->Ng, sd->I, t); + float pdf = triangle_light_pdf(kg, ccl_fetch(sd, Ng), ccl_fetch(sd, I), t); float mis_weight = power_heuristic(bsdf_pdf, pdf); return L*mis_weight; @@ -172,7 +191,11 @@ ccl_device_noinline float3 indirect_primitive_emission(KernelGlobals *kg, Shader /* Indirect Lamp Emission */ -ccl_device_noinline bool indirect_lamp_emission(KernelGlobals *kg, PathState *state, Ray *ray, float3 *emission) +ccl_device_noinline bool indirect_lamp_emission(KernelGlobals *kg, PathState *state, Ray *ray, float3 *emission +#ifdef __SPLIT_KERNEL__ + ,ShaderData *sd +#endif + ) { bool hit_lamp = false; @@ -188,14 +211,21 @@ ccl_device_noinline bool indirect_lamp_emission(KernelGlobals *kg, PathState *st /* use visibility flag to skip lights */ if(ls.shader & SHADER_EXCLUDE_ANY) { if(((ls.shader & SHADER_EXCLUDE_DIFFUSE) && (state->flag & PATH_RAY_DIFFUSE)) || - ((ls.shader & SHADER_EXCLUDE_GLOSSY) && (state->flag & PATH_RAY_REFLECT)) || + ((ls.shader & SHADER_EXCLUDE_GLOSSY) && + 
((state->flag & (PATH_RAY_GLOSSY|PATH_RAY_REFLECT)) == (PATH_RAY_GLOSSY|PATH_RAY_REFLECT))) || ((ls.shader & SHADER_EXCLUDE_TRANSMIT) && (state->flag & PATH_RAY_TRANSMIT)) || ((ls.shader & SHADER_EXCLUDE_SCATTER) && (state->flag & PATH_RAY_VOLUME_SCATTER))) continue; } #endif - float3 L = direct_emissive_eval(kg, &ls, -ray->D, ray->dD, ls.t, ray->time, state->bounce, state->transparent_bounce); + float3 L = direct_emissive_eval(kg, &ls, -ray->D, ray->dD, ls.t, ray->time, + state->bounce, + state->transparent_bounce +#ifdef __SPLIT_KERNEL__ + ,sd +#endif + ); #ifdef __VOLUME__ if(state->volume_stack[0].shader != SHADER_NONE) { @@ -224,7 +254,11 @@ ccl_device_noinline bool indirect_lamp_emission(KernelGlobals *kg, PathState *st /* Indirect Background */ -ccl_device_noinline float3 indirect_background(KernelGlobals *kg, PathState *state, Ray *ray) +ccl_device_noinline float3 indirect_background(KernelGlobals *kg, ccl_addr_space PathState *state, ccl_addr_space Ray *ray +#ifdef __SPLIT_KERNEL__ + ,ShaderData *sd_global +#endif + ) { #ifdef __BACKGROUND__ int shader = kernel_data.background.surface_shader; @@ -232,18 +266,25 @@ ccl_device_noinline float3 indirect_background(KernelGlobals *kg, PathState *sta /* use visibility flag to skip lights */ if(shader & SHADER_EXCLUDE_ANY) { if(((shader & SHADER_EXCLUDE_DIFFUSE) && (state->flag & PATH_RAY_DIFFUSE)) || - ((shader & SHADER_EXCLUDE_GLOSSY) && (state->flag & PATH_RAY_REFLECT)) || + ((shader & SHADER_EXCLUDE_GLOSSY) && + ((state->flag & (PATH_RAY_GLOSSY|PATH_RAY_REFLECT)) == (PATH_RAY_GLOSSY|PATH_RAY_REFLECT))) || ((shader & SHADER_EXCLUDE_TRANSMIT) && (state->flag & PATH_RAY_TRANSMIT)) || ((shader & SHADER_EXCLUDE_CAMERA) && (state->flag & PATH_RAY_CAMERA)) || ((shader & SHADER_EXCLUDE_SCATTER) && (state->flag & PATH_RAY_VOLUME_SCATTER))) return make_float3(0.0f, 0.0f, 0.0f); } +#ifdef __SPLIT_KERNEL__ /* evaluate background closure */ + Ray priv_ray = *ray; + shader_setup_from_background(kg, sd_global, &priv_ray, 
state->bounce+1, state->transparent_bounce); + float3 L = shader_eval_background(kg, sd_global, state->flag, SHADER_CONTEXT_EMISSION); +#else ShaderData sd; shader_setup_from_background(kg, &sd, ray, state->bounce+1, state->transparent_bounce); float3 L = shader_eval_background(kg, &sd, state->flag, SHADER_CONTEXT_EMISSION); +#endif #ifdef __BACKGROUND_MIS__ /* check if background light exists or if we should skip pdf */ @@ -252,7 +293,7 @@ ccl_device_noinline float3 indirect_background(KernelGlobals *kg, PathState *sta if(!(state->flag & PATH_RAY_MIS_SKIP) && res) { /* multiple importance sampling, get background light pdf for ray * direction, and compute weight with respect to BSDF pdf */ - float pdf = background_light_pdf(kg, ray->D); + float pdf = background_light_pdf(kg, ray->P, ray->D); float mis_weight = power_heuristic(state->ray_pdf, pdf); return L*mis_weight; diff --git a/intern/cycles/kernel/kernel_film.h b/intern/cycles/kernel/kernel_film.h index 4668b40b86d..f9e9b413898 100644 --- a/intern/cycles/kernel/kernel_film.h +++ b/intern/cycles/kernel/kernel_film.h @@ -27,7 +27,7 @@ ccl_device float4 film_map(KernelGlobals *kg, float4 irradiance, float scale) result.z = color_scene_linear_to_srgb(result.z*exposure); /* clamp since alpha might be > 1.0 due to russian roulette */ - result.w = clamp(result.w, 0.0f, 1.0f); + result.w = saturate(result.w); return result; } @@ -37,10 +37,10 @@ ccl_device uchar4 film_float_to_byte(float4 color) uchar4 result; /* simple float to byte conversion */ - result.x = (uchar)clamp(color.x*255.0f, 0.0f, 255.0f); - result.y = (uchar)clamp(color.y*255.0f, 0.0f, 255.0f); - result.z = (uchar)clamp(color.z*255.0f, 0.0f, 255.0f); - result.w = (uchar)clamp(color.w*255.0f, 0.0f, 255.0f); + result.x = (uchar)(saturate(color.x)*255.0f); + result.y = (uchar)(saturate(color.y)*255.0f); + result.z = (uchar)(saturate(color.z)*255.0f); + result.w = (uchar)(saturate(color.w)*255.0f); return result; } diff --git 
a/intern/cycles/kernel/kernel_globals.h b/intern/cycles/kernel/kernel_globals.h index 0a9753baca2..17fa18909c4 100644 --- a/intern/cycles/kernel/kernel_globals.h +++ b/intern/cycles/kernel/kernel_globals.h @@ -80,7 +80,7 @@ typedef struct KernelGlobals {} KernelGlobals; #ifdef __KERNEL_OPENCL__ -typedef struct KernelGlobals { +typedef ccl_addr_space struct KernelGlobals { ccl_constant KernelData *data; #define KERNEL_TEX(type, ttype, name) \ @@ -94,7 +94,7 @@ typedef struct KernelGlobals { ccl_device float lookup_table_read(KernelGlobals *kg, float x, int offset, int size) { - x = clamp(x, 0.0f, 1.0f)*(size-1); + x = saturate(x)*(size-1); int index = min(float_to_int(x), size-1); int nindex = min(index+1, size-1); @@ -110,7 +110,7 @@ ccl_device float lookup_table_read(KernelGlobals *kg, float x, int offset, int s ccl_device float lookup_table_read_2D(KernelGlobals *kg, float x, float y, int offset, int xsize, int ysize) { - y = clamp(y, 0.0f, 1.0f)*(ysize-1); + y = saturate(y)*(ysize-1); int index = min(float_to_int(y), ysize-1); int nindex = min(index+1, ysize-1); diff --git a/intern/cycles/kernel/kernel_jitter.h b/intern/cycles/kernel/kernel_jitter.h index 6953f005ea9..9ba41635b9e 100644 --- a/intern/cycles/kernel/kernel_jitter.h +++ b/intern/cycles/kernel/kernel_jitter.h @@ -128,7 +128,7 @@ ccl_device_inline uint cmj_permute(uint i, uint l, uint p) i *= 0xc860a3df; i &= w; i ^= i >> 5; - } while (i >= l); + } while(i >= l); return (i + p) % l; } diff --git a/intern/cycles/kernel/kernel_light.h b/intern/cycles/kernel/kernel_light.h index 76fa754b5fa..1badbc3b9f7 100644 --- a/intern/cycles/kernel/kernel_light.h +++ b/intern/cycles/kernel/kernel_light.h @@ -33,6 +33,98 @@ typedef struct LightSample { LightType type; /* type of light */ } LightSample; +/* Area light sampling */ + +/* Uses the following paper: + * + * Carlos Urena et al. + * An Area-Preserving Parametrization for Spherical Rectangles. 
+ * + * https://www.solidangle.com/research/egsr2013_spherical_rectangle.pdf + * + * Note: light_p is modified when sample_coord is true. + */ +ccl_device float area_light_sample(float3 P, + float3 *light_p, + float3 axisu, float3 axisv, + float randu, float randv, + bool sample_coord) +{ + /* In our name system we're using P for the center, + * which is o in the paper. + */ + + float3 corner = *light_p - axisu * 0.5f - axisv * 0.5f; + float axisu_len, axisv_len; + /* Compute local reference system R. */ + float3 x = normalize_len(axisu, &axisu_len); + float3 y = normalize_len(axisv, &axisv_len); + float3 z = cross(x, y); + /* Compute rectangle coords in local reference system. */ + float3 dir = corner - P; + float z0 = dot(dir, z); + /* Flip 'z' to make it point against Q. */ + if(z0 > 0.0f) { + z *= -1.0f; + z0 *= -1.0f; + } + float x0 = dot(dir, x); + float y0 = dot(dir, y); + float x1 = x0 + axisu_len; + float y1 = y0 + axisv_len; + /* Create vectors to four vertices. */ + float3 v00 = make_float3(x0, y0, z0); + float3 v01 = make_float3(x0, y1, z0); + float3 v10 = make_float3(x1, y0, z0); + float3 v11 = make_float3(x1, y1, z0); + /* Compute normals to edges. */ + float3 n0 = normalize(cross(v00, v10)); + float3 n1 = normalize(cross(v10, v11)); + float3 n2 = normalize(cross(v11, v01)); + float3 n3 = normalize(cross(v01, v00)); + /* Compute internal angles (gamma_i). */ + float g0 = safe_acosf(-dot(n0, n1)); + float g1 = safe_acosf(-dot(n1, n2)); + float g2 = safe_acosf(-dot(n2, n3)); + float g3 = safe_acosf(-dot(n3, n0)); + /* Compute predefined constants. */ + float b0 = n0.z; + float b1 = n2.z; + float b0sq = b0 * b0; + float k = M_2PI_F - g2 - g3; + /* Compute solid angle from internal angles. */ + float S = g0 + g1 - k; + + if(sample_coord) { + /* Compute cu. */ + float au = randu * S + k; + float fu = (cosf(au) * b0 - b1) / sinf(au); + float cu = 1.0f / sqrtf(fu * fu + b0sq) * (fu > 0.0f ? 1.0f : -1.0f); + cu = clamp(cu, -1.0f, 1.0f); + /* Compute xu. 
*/ + float xu = -(cu * z0) / sqrtf(1.0f - cu * cu); + xu = clamp(xu, x0, x1); + /* Compute yv. */ + float z0sq = z0 * z0; + float y0sq = y0 * y0; + float y1sq = y1 * y1; + float d = sqrtf(xu * xu + z0sq); + float h0 = y0 / sqrtf(d * d + y0sq); + float h1 = y1 / sqrtf(d * d + y1sq); + float hv = h0 + randv * (h1 - h0), hv2 = hv * hv; + float yv = (hv2 < 1.0f - 1e-6f) ? (hv * d) / sqrtf(1.0f - hv2) : y1; + + /* Transform (xu, yv, z0) to world coords. */ + *light_p = P + xu * x + yv * y + z0 * z; + } + + /* return pdf */ + if(S != 0.0f) + return 1.0f / S; + else + return 0.0f; +} + /* Background Light */ #ifdef __BACKGROUND_MIS__ @@ -46,7 +138,7 @@ ccl_device_noinline #else ccl_device #endif -float3 background_light_sample(KernelGlobals *kg, float randu, float randv, float *pdf) +float3 background_map_sample(KernelGlobals *kg, float randu, float randv, float *pdf) { /* for the following, the CDF values are actually a pair of floats, with the * function value as X and the actual CDF as Y. The last entry's function @@ -116,10 +208,8 @@ float3 background_light_sample(KernelGlobals *kg, float randu, float randv, floa else *pdf = (cdf_u.x * cdf_v.x)/(M_2PI_F * M_PI_F * sin_theta * denom); - *pdf *= kernel_data.integrator.pdf_lights; - /* compute direction */ - return -equirectangular_to_direction(u, v); + return equirectangular_to_direction(u, v); } /* TODO(sergey): Same as above, after the release we should consider using @@ -130,7 +220,7 @@ ccl_device_noinline #else ccl_device #endif -float background_light_pdf(KernelGlobals *kg, float3 direction) +float background_map_pdf(KernelGlobals *kg, float3 direction) { float2 uv = direction_to_equirectangular(direction); int res = kernel_data.integrator.pdf_background_res; @@ -156,9 +246,223 @@ float background_light_pdf(KernelGlobals *kg, float3 direction) float2 cdf_u = kernel_tex_fetch(__light_background_conditional_cdf, index_v * (res + 1) + index_u); float2 cdf_v = kernel_tex_fetch(__light_background_marginal_cdf, index_v); 
- float pdf = (cdf_u.x * cdf_v.x)/(M_2PI_F * M_PI_F * sin_theta * denom); + return (cdf_u.x * cdf_v.x)/(M_2PI_F * M_PI_F * sin_theta * denom); +} + +ccl_device_inline bool background_portal_data_fetch_and_check_side(KernelGlobals *kg, + float3 P, + int index, + float3 *lightpos, + float3 *dir) +{ + float4 data0 = kernel_tex_fetch(__light_data, (index + kernel_data.integrator.portal_offset)*LIGHT_SIZE + 0); + float4 data3 = kernel_tex_fetch(__light_data, (index + kernel_data.integrator.portal_offset)*LIGHT_SIZE + 3); + + *lightpos = make_float3(data0.y, data0.z, data0.w); + *dir = make_float3(data3.y, data3.z, data3.w); + + /* Check whether portal is on the right side. */ + if(dot(*dir, P - *lightpos) > 1e-5f) + return true; + + return false; +} + +ccl_device float background_portal_pdf(KernelGlobals *kg, + float3 P, + float3 direction, + int ignore_portal, + bool *is_possible) +{ + float portal_pdf = 0.0f; + + for(int p = 0; p < kernel_data.integrator.num_portals; p++) { + if(p == ignore_portal) + continue; + + float3 lightpos, dir; + if(!background_portal_data_fetch_and_check_side(kg, P, p, &lightpos, &dir)) + continue; + + if(is_possible) { + /* There's a portal that could be sampled from this position. */ + *is_possible = true; + } + + float t = -(dot(P, dir) - dot(lightpos, dir)) / dot(direction, dir); + if(t <= 1e-5f) { + /* Either behind the portal or too close. */ + continue; + } - return pdf * kernel_data.integrator.pdf_lights; + float4 data1 = kernel_tex_fetch(__light_data, (p + kernel_data.integrator.portal_offset)*LIGHT_SIZE + 1); + float4 data2 = kernel_tex_fetch(__light_data, (p + kernel_data.integrator.portal_offset)*LIGHT_SIZE + 2); + + float3 axisu = make_float3(data1.y, data1.z, data1.w); + float3 axisv = make_float3(data2.y, data2.z, data2.w); + + float3 hit = P + t*direction; + float3 inplane = hit - lightpos; + /* Skip if the the ray doesn't pass through portal. 
*/ + if(fabsf(dot(inplane, axisu) / dot(axisu, axisu)) > 0.5f) + continue; + if(fabsf(dot(inplane, axisv) / dot(axisv, axisv)) > 0.5f) + continue; + + portal_pdf += area_light_sample(P, &lightpos, axisu, axisv, 0.0f, 0.0f, false); + } + + return kernel_data.integrator.num_portals? portal_pdf / kernel_data.integrator.num_portals: 0.0f; +} + +ccl_device int background_num_possible_portals(KernelGlobals *kg, float3 P) +{ + int num_possible_portals = 0; + for(int p = 0; p < kernel_data.integrator.num_portals; p++) { + float3 lightpos, dir; + if(background_portal_data_fetch_and_check_side(kg, P, p, &lightpos, &dir)) + num_possible_portals++; + } + return num_possible_portals; +} + +ccl_device float3 background_portal_sample(KernelGlobals *kg, + float3 P, + float randu, + float randv, + int num_possible, + int *sampled_portal, + float *pdf) +{ + /* Pick a portal, then re-normalize randv. */ + randv *= num_possible; + int portal = (int)randv; + randv -= portal; + + /* TODO(sergey): Some smarter way of finding portal to sample + * is welcome. + */ + for(int p = 0; p < kernel_data.integrator.num_portals; p++) { + /* Search for the sampled portal. */ + float3 lightpos, dir; + if(!background_portal_data_fetch_and_check_side(kg, P, p, &lightpos, &dir)) + continue; + + if(portal == 0) { + /* p is the portal to be sampled. 
*/ + float4 data1 = kernel_tex_fetch(__light_data, (p + kernel_data.integrator.portal_offset)*LIGHT_SIZE + 1); + float4 data2 = kernel_tex_fetch(__light_data, (p + kernel_data.integrator.portal_offset)*LIGHT_SIZE + 2); + float3 axisu = make_float3(data1.y, data1.z, data1.w); + float3 axisv = make_float3(data2.y, data2.z, data2.w); + + *pdf = area_light_sample(P, &lightpos, + axisu, axisv, + randu, randv, + true); + + *pdf /= num_possible; + *sampled_portal = p; + return normalize(lightpos - P); + } + + portal--; + } + + return make_float3(0.0f, 0.0f, 0.0f); +} + +ccl_device float3 background_light_sample(KernelGlobals *kg, float3 P, float randu, float randv, float *pdf) +{ + /* Probability of sampling portals instead of the map. */ + float portal_sampling_pdf = kernel_data.integrator.portal_pdf; + + /* Check if there are portals in the scene which we can sample. */ + if(portal_sampling_pdf > 0.0f) { + int num_portals = background_num_possible_portals(kg, P); + if(num_portals > 0) { + if(portal_sampling_pdf == 1.0f || randu < portal_sampling_pdf) { + if(portal_sampling_pdf < 1.0f) { + randu /= portal_sampling_pdf; + } + int portal; + float3 D = background_portal_sample(kg, P, randu, randv, num_portals, &portal, pdf); + if(num_portals > 1) { + /* Ignore the chosen portal, its pdf is already included. */ + *pdf += background_portal_pdf(kg, P, D, portal, NULL); + } + /* We could also have sampled the map, so combine with MIS. */ + if(portal_sampling_pdf < 1.0f) { + float cdf_pdf = background_map_pdf(kg, D); + *pdf = (portal_sampling_pdf * (*pdf) + + (1.0f - portal_sampling_pdf) * cdf_pdf); + } + return D; + } else { + /* Sample map, but with nonzero portal_sampling_pdf for MIS. */ + randu = (randu - portal_sampling_pdf) / (1.0f - portal_sampling_pdf); + } + } else { + /* We can't sample a portal. + * Check if we can sample the map instead. + */ + if(portal_sampling_pdf == 1.0f) { + /* Use uniform as a fallback if we can't sample the map. 
*/ + *pdf = 1.0f / M_4PI_F; + return sample_uniform_sphere(randu, randv); + } + else { + portal_sampling_pdf = 0.0f; + } + } + } + + float3 D = background_map_sample(kg, randu, randv, pdf); + /* Use MIS if portals could be sampled as well. */ + if(portal_sampling_pdf > 0.0f) { + float portal_pdf = background_portal_pdf(kg, P, D, -1, NULL); + *pdf = (portal_sampling_pdf * portal_pdf + + (1.0f - portal_sampling_pdf) * (*pdf)); + } + return D; +} + +ccl_device float background_light_pdf(KernelGlobals *kg, float3 P, float3 direction) +{ + /* Probability of sampling portals instead of the map. */ + float portal_sampling_pdf = kernel_data.integrator.portal_pdf; + + if(portal_sampling_pdf > 0.0f) { + bool is_possible = false; + float portal_pdf = background_portal_pdf(kg, P, direction, -1, &is_possible); + if(portal_pdf == 0.0f) { + if(portal_sampling_pdf == 1.0f) { + /* If there are no possible portals at this point, + * the fallback sampling would have been used. + * Otherwise, the direction would not be sampled at all => pdf = 0 + */ + return is_possible? 0.0f: kernel_data.integrator.pdf_lights / M_4PI_F; + } + else { + /* We can only sample the map. */ + return background_map_pdf(kg, direction) * kernel_data.integrator.pdf_lights; + } + } else { + if(portal_sampling_pdf == 1.0f) { + /* We can only sample portals. */ + return portal_pdf * kernel_data.integrator.pdf_lights; + } + else { + /* We can sample both, so combine with MIS. */ + return (background_map_pdf(kg, direction) * (1.0f - portal_sampling_pdf) + + portal_pdf * portal_sampling_pdf) * kernel_data.integrator.pdf_lights; + } + } + } + + /* No portals in the scene, so must sample the map. + * At least one of them is always possible if we have a LIGHT_BACKGROUND. 
+ */ + return background_map_pdf(kg, direction) * kernel_data.integrator.pdf_lights; } #endif @@ -184,96 +488,6 @@ ccl_device float3 sphere_light_sample(float3 P, float3 center, float radius, flo return disk_light_sample(normalize(P - center), randu, randv)*radius; } -/* Uses the following paper: - * - * Carlos Urena et al. - * An Area-Preserving Parametrization for Spherical Rectangles. - * - * https://www.solidangle.com/research/egsr2013_spherical_rectangle.pdf - * - * Note: light_p is modified when sample_coord is true. - */ -ccl_device float area_light_sample(float3 P, - float3 *light_p, - float3 axisu, float3 axisv, - float randu, float randv, - bool sample_coord) -{ - /* In our name system we're using P for the center, - * which is o in the paper. - */ - - float3 corner = *light_p - axisu * 0.5f - axisv * 0.5f; - float axisu_len, axisv_len; - /* Compute local reference system R. */ - float3 x = normalize_len(axisu, &axisu_len); - float3 y = normalize_len(axisv, &axisv_len); - float3 z = cross(x, y); - /* Compute rectangle coords in local reference system. */ - float3 dir = corner - P; - float z0 = dot(dir, z); - /* Flip 'z' to make it point against Q. */ - if(z0 > 0.0f) { - z *= -1.0f; - z0 *= -1.0f; - } - float x0 = dot(dir, x); - float y0 = dot(dir, y); - float x1 = x0 + axisu_len; - float y1 = y0 + axisv_len; - /* Create vectors to four vertices. */ - float3 v00 = make_float3(x0, y0, z0); - float3 v01 = make_float3(x0, y1, z0); - float3 v10 = make_float3(x1, y0, z0); - float3 v11 = make_float3(x1, y1, z0); - /* Compute normals to edges. */ - float3 n0 = normalize(cross(v00, v10)); - float3 n1 = normalize(cross(v10, v11)); - float3 n2 = normalize(cross(v11, v01)); - float3 n3 = normalize(cross(v01, v00)); - /* Compute internal angles (gamma_i). */ - float g0 = safe_acosf(-dot(n0, n1)); - float g1 = safe_acosf(-dot(n1, n2)); - float g2 = safe_acosf(-dot(n2, n3)); - float g3 = safe_acosf(-dot(n3, n0)); - /* Compute predefined constants. 
*/ - float b0 = n0.z; - float b1 = n2.z; - float b0sq = b0 * b0; - float k = M_2PI_F - g2 - g3; - /* Compute solid angle from internal angles. */ - float S = g0 + g1 - k; - - if(sample_coord) { - /* Compute cu. */ - float au = randu * S + k; - float fu = (cosf(au) * b0 - b1) / sinf(au); - float cu = 1.0f / sqrtf(fu * fu + b0sq) * (fu > 0.0f ? 1.0f : -1.0f); - cu = clamp(cu, -1.0f, 1.0f); - /* Compute xu. */ - float xu = -(cu * z0) / sqrtf(1.0f - cu * cu); - xu = clamp(xu, x0, x1); - /* Compute yv. */ - float z0sq = z0 * z0; - float y0sq = y0 * y0; - float y1sq = y1 * y1; - float d = sqrtf(xu * xu + z0sq); - float h0 = y0 / sqrtf(d * d + y0sq); - float h1 = y1 / sqrtf(d * d + y1sq); - float hv = h0 + randv * (h1 - h0), hv2 = hv * hv; - float yv = (hv2 < 1.0f - 1e-6f) ? (hv * d) / sqrtf(1.0f - hv2) : y1; - - /* Transform (xu, yv, z0) to world coords. */ - *light_p = P + xu * x + yv * y + z0 * z; - } - - /* return pdf */ - if(S != 0.0f) - return 1.0f / S; - else - return 0.0f; -} - ccl_device float spot_light_attenuation(float4 data1, float4 data2, LightSample *ls) { float3 dir = make_float3(data2.y, data2.z, data2.w); @@ -344,13 +558,14 @@ ccl_device void lamp_light_sample(KernelGlobals *kg, int lamp, #ifdef __BACKGROUND_MIS__ else if(type == LIGHT_BACKGROUND) { /* infinite area light (e.g. 
light dome or env light) */ - float3 D = background_light_sample(kg, randu, randv, &ls->pdf); + float3 D = -background_light_sample(kg, P, randu, randv, &ls->pdf); ls->P = D; ls->Ng = D; ls->D = -D; ls->t = FLT_MAX; ls->eval_fac = 1.0f; + ls->pdf *= kernel_data.integrator.pdf_lights; } #endif else { diff --git a/intern/cycles/kernel/kernel_passes.h b/intern/cycles/kernel/kernel_passes.h index 6bb39ee485d..20cf3fa931b 100644 --- a/intern/cycles/kernel/kernel_passes.h +++ b/intern/cycles/kernel/kernel_passes.h @@ -19,23 +19,49 @@ CCL_NAMESPACE_BEGIN ccl_device_inline void kernel_write_pass_float(ccl_global float *buffer, int sample, float value) { ccl_global float *buf = buffer; +#if defined(__SPLIT_KERNEL__) && defined(__WORK_STEALING__) + atomic_add_float(buf, value); +#else *buf = (sample == 0)? value: *buf + value; +#endif // __SPLIT_KERNEL__ && __WORK_STEALING__ } ccl_device_inline void kernel_write_pass_float3(ccl_global float *buffer, int sample, float3 value) { +#if defined(__SPLIT_KERNEL__) && defined(__WORK_STEALING__) + ccl_global float *buf_x = buffer + 0; + ccl_global float *buf_y = buffer + 1; + ccl_global float *buf_z = buffer + 2; + + atomic_add_float(buf_x, value.x); + atomic_add_float(buf_y, value.y); + atomic_add_float(buf_z, value.z); +#else ccl_global float3 *buf = (ccl_global float3*)buffer; *buf = (sample == 0)? value: *buf + value; +#endif // __SPLIT_KERNEL__ && __WORK_STEALING__ } ccl_device_inline void kernel_write_pass_float4(ccl_global float *buffer, int sample, float4 value) { +#if defined(__SPLIT_KERNEL__) && defined(__WORK_STEALING__) + ccl_global float *buf_x = buffer + 0; + ccl_global float *buf_y = buffer + 1; + ccl_global float *buf_z = buffer + 2; + ccl_global float *buf_w = buffer + 3; + + atomic_add_float(buf_x, value.x); + atomic_add_float(buf_y, value.y); + atomic_add_float(buf_z, value.z); + atomic_add_float(buf_w, value.w); +#else ccl_global float4 *buf = (ccl_global float4*)buffer; *buf = (sample == 0)? 
value: *buf + value; +#endif // __SPLIT_KERNEL__ && __WORK_STEALING__ } ccl_device_inline void kernel_write_data_passes(KernelGlobals *kg, ccl_global float *buffer, PathRadiance *L, - ShaderData *sd, int sample, PathState *state, float3 throughput) + ShaderData *sd, int sample, ccl_addr_space PathState *state, float3 throughput) { #ifdef __PASSES__ int path_flag = state->flag; @@ -49,18 +75,18 @@ ccl_device_inline void kernel_write_data_passes(KernelGlobals *kg, ccl_global fl return; if(!(path_flag & PATH_RAY_SINGLE_PASS_DONE)) { - if(!(sd->flag & SD_TRANSPARENT) || + if(!(ccl_fetch(sd, flag) & SD_TRANSPARENT) || kernel_data.film.pass_alpha_threshold == 0.0f || average(shader_bsdf_alpha(kg, sd)) >= kernel_data.film.pass_alpha_threshold) { if(sample == 0) { if(flag & PASS_DEPTH) { - float depth = camera_distance(kg, sd->P); + float depth = camera_distance(kg, ccl_fetch(sd, P)); kernel_write_pass_float(buffer + kernel_data.film.pass_depth, sample, depth); } if(flag & PASS_OBJECT_ID) { - float id = object_pass_id(kg, sd->object); + float id = object_pass_id(kg, ccl_fetch(sd, object)); kernel_write_pass_float(buffer + kernel_data.film.pass_object_id, sample, id); } if(flag & PASS_MATERIAL_ID) { @@ -70,7 +96,7 @@ ccl_device_inline void kernel_write_data_passes(KernelGlobals *kg, ccl_global fl } if(flag & PASS_NORMAL) { - float3 normal = sd->N; + float3 normal = ccl_fetch(sd, N); kernel_write_pass_float3(buffer + kernel_data.film.pass_normal, sample, normal); } if(flag & PASS_UV) { @@ -101,8 +127,8 @@ ccl_device_inline void kernel_write_data_passes(KernelGlobals *kg, ccl_global fl float mist_start = kernel_data.film.mist_start; float mist_inv_depth = kernel_data.film.mist_inv_depth; - float depth = camera_distance(kg, sd->P); - float mist = clamp((depth - mist_start)*mist_inv_depth, 0.0f, 1.0f); + float depth = camera_distance(kg, ccl_fetch(sd, P)); + float mist = saturate((depth - mist_start)*mist_inv_depth); /* falloff */ float mist_falloff = 
kernel_data.film.mist_falloff; diff --git a/intern/cycles/kernel/kernel_path.h b/intern/cycles/kernel/kernel_path.h index 9b9495644dd..9794ad1d180 100644 --- a/intern/cycles/kernel/kernel_path.h +++ b/intern/cycles/kernel/kernel_path.h @@ -42,6 +42,7 @@ #include "kernel_path_state.h" #include "kernel_shadow.h" #include "kernel_emission.h" +#include "kernel_path_common.h" #include "kernel_path_surface.h" #include "kernel_path_volume.h" @@ -273,8 +274,6 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg, RNG *rng, Ray ray, float bssrdf_u, bssrdf_v; path_state_rng_2D(kg, rng, &state, PRNG_BSDF_U, &bssrdf_u, &bssrdf_v); subsurface_scatter_step(kg, &sd, state.flag, sc, &lcg_state, bssrdf_u, bssrdf_v, false); - - state.flag |= PATH_RAY_BSSRDF_ANCESTOR; } } #endif @@ -307,17 +306,17 @@ ccl_device void kernel_path_ao(KernelGlobals *kg, ShaderData *sd, PathRadiance * sample_cos_hemisphere(ao_N, bsdf_u, bsdf_v, &ao_D, &ao_pdf); - if(dot(sd->Ng, ao_D) > 0.0f && ao_pdf != 0.0f) { + if(dot(ccl_fetch(sd, Ng), ao_D) > 0.0f && ao_pdf != 0.0f) { Ray light_ray; float3 ao_shadow; - light_ray.P = ray_offset(sd->P, sd->Ng); + light_ray.P = ray_offset(ccl_fetch(sd, P), ccl_fetch(sd, Ng)); light_ray.D = ao_D; light_ray.t = kernel_data.background.ao_distance; #ifdef __OBJECT_MOTION__ - light_ray.time = sd->time; + light_ray.time = ccl_fetch(sd, time); #endif - light_ray.dP = sd->dP; + light_ray.dP = ccl_fetch(sd, dP); light_ray.dD = differential3_zero(); if(!shadow_blocked(kg, state, &light_ray, &ao_shadow)) @@ -325,70 +324,8 @@ ccl_device void kernel_path_ao(KernelGlobals *kg, ShaderData *sd, PathRadiance * } } -ccl_device void kernel_branched_path_ao(KernelGlobals *kg, ShaderData *sd, PathRadiance *L, PathState *state, RNG *rng, float3 throughput) -{ - int num_samples = kernel_data.integrator.ao_samples; - float num_samples_inv = 1.0f/num_samples; - float ao_factor = kernel_data.background.ao_factor; - float3 ao_N; - float3 ao_bsdf = shader_bsdf_ao(kg, sd, ao_factor, &ao_N); - 
float3 ao_alpha = shader_bsdf_alpha(kg, sd); - - for(int j = 0; j < num_samples; j++) { - float bsdf_u, bsdf_v; - path_branched_rng_2D(kg, rng, state, j, num_samples, PRNG_BSDF_U, &bsdf_u, &bsdf_v); - - float3 ao_D; - float ao_pdf; - - sample_cos_hemisphere(ao_N, bsdf_u, bsdf_v, &ao_D, &ao_pdf); - - if(dot(sd->Ng, ao_D) > 0.0f && ao_pdf != 0.0f) { - Ray light_ray; - float3 ao_shadow; - - light_ray.P = ray_offset(sd->P, sd->Ng); - light_ray.D = ao_D; - light_ray.t = kernel_data.background.ao_distance; -#ifdef __OBJECT_MOTION__ - light_ray.time = sd->time; -#endif - light_ray.dP = sd->dP; - light_ray.dD = differential3_zero(); - - if(!shadow_blocked(kg, state, &light_ray, &ao_shadow)) - path_radiance_accum_ao(L, throughput*num_samples_inv, ao_alpha, ao_bsdf, ao_shadow, state->bounce); - } - } -} - #ifdef __SUBSURFACE__ -#ifdef __VOLUME__ -ccl_device void kernel_path_subsurface_update_volume_stack(KernelGlobals *kg, - Ray *ray, - VolumeStack *stack) -{ - kernel_assert(kernel_data.integrator.use_volumes); - - Ray volume_ray = *ray; - Intersection isect; - int step = 0; - while(step < VOLUME_STACK_SIZE && - scene_intersect_volume(kg, &volume_ray, &isect)) - { - ShaderData sd; - shader_setup_from_ray(kg, &sd, &isect, &volume_ray, 0, 0); - kernel_volume_stack_enter_exit(kg, &sd, stack); - - /* Move ray forward. 
*/ - volume_ray.P = ray_offset(sd.P, -sd.Ng); - volume_ray.t -= sd.ray_length; - ++step; - } -} -#endif - ccl_device bool kernel_path_subsurface_scatter(KernelGlobals *kg, ShaderData *sd, PathRadiance *L, PathState *state, RNG *rng, Ray *ray, float3 *throughput) { float bssrdf_probability; @@ -408,7 +345,7 @@ ccl_device bool kernel_path_subsurface_scatter(KernelGlobals *kg, ShaderData *sd #ifdef __VOLUME__ Ray volume_ray = *ray; bool need_update_volume_stack = kernel_data.integrator.use_volumes && - sd->flag & SD_OBJECT_INTERSECTS_VOLUME; + ccl_fetch(sd, flag) & SD_OBJECT_INTERSECTS_VOLUME; #endif /* compute lighting with the BSDF closure */ @@ -417,7 +354,6 @@ ccl_device bool kernel_path_subsurface_scatter(KernelGlobals *kg, ShaderData *sd PathState hit_state = *state; Ray hit_ray = *ray; - hit_state.flag |= PATH_RAY_BSSRDF_ANCESTOR; hit_state.rng_offset += PRNG_BOUNCE_NUM; kernel_path_surface_connect_light(kg, rng, &bssrdf_sd[hit], tp, state, L); @@ -433,7 +369,7 @@ ccl_device bool kernel_path_subsurface_scatter(KernelGlobals *kg, ShaderData *sd volume_ray.D = normalize_len(hit_ray.P - volume_ray.P, &volume_ray.t); - kernel_path_subsurface_update_volume_stack( + kernel_volume_stack_update_for_subsurface( kg, &volume_ray, hit_state.volume_stack); @@ -503,7 +439,9 @@ ccl_device float4 kernel_path_integrate(KernelGlobals *kg, RNG *rng, int sample, #ifdef __KERNEL_DEBUG__ if(state.flag & PATH_RAY_CAMERA) { debug_data.num_bvh_traversal_steps += isect.num_traversal_steps; + debug_data.num_bvh_traversed_instances += isect.num_traversed_instances; } + debug_data.num_ray_bounces++; #endif #ifdef __LAMP_MIS__ @@ -733,474 +671,6 @@ ccl_device float4 kernel_path_integrate(KernelGlobals *kg, RNG *rng, int sample, return make_float4(L_sum.x, L_sum.y, L_sum.z, 1.0f - L_transparent); } -#ifdef __BRANCHED_PATH__ - -/* branched path tracing: bounce off surface and integrate indirect light */ -ccl_device_noinline void kernel_branched_path_surface_indirect_light(KernelGlobals *kg, - 
RNG *rng, ShaderData *sd, float3 throughput, float num_samples_adjust, - PathState *state, PathRadiance *L) -{ - for(int i = 0; i< sd->num_closure; i++) { - const ShaderClosure *sc = &sd->closure[i]; - - if(!CLOSURE_IS_BSDF(sc->type)) - continue; - /* transparency is not handled here, but in outer loop */ - if(sc->type == CLOSURE_BSDF_TRANSPARENT_ID) - continue; - - int num_samples; - - if(CLOSURE_IS_BSDF_DIFFUSE(sc->type)) - num_samples = kernel_data.integrator.diffuse_samples; - else if(CLOSURE_IS_BSDF_BSSRDF(sc->type)) - num_samples = 1; - else if(CLOSURE_IS_BSDF_GLOSSY(sc->type)) - num_samples = kernel_data.integrator.glossy_samples; - else - num_samples = kernel_data.integrator.transmission_samples; - - num_samples = ceil_to_int(num_samples_adjust*num_samples); - - float num_samples_inv = num_samples_adjust/num_samples; - RNG bsdf_rng = cmj_hash(*rng, i); - - for(int j = 0; j < num_samples; j++) { - PathState ps = *state; - float3 tp = throughput; - Ray bsdf_ray; - - if(!kernel_branched_path_surface_bounce(kg, &bsdf_rng, sd, sc, j, num_samples, &tp, &ps, L, &bsdf_ray)) - continue; - - kernel_path_indirect(kg, rng, bsdf_ray, tp*num_samples_inv, num_samples, ps, L); - - /* for render passes, sum and reset indirect light pass variables - * for the next samples */ - path_radiance_sum_indirect(L); - path_radiance_reset_indirect(L); - } - } -} - -#ifdef __SUBSURFACE__ -ccl_device void kernel_branched_path_subsurface_scatter(KernelGlobals *kg, - ShaderData *sd, - PathRadiance *L, - PathState *state, - RNG *rng, - Ray *ray, - float3 throughput) -{ - for(int i = 0; i< sd->num_closure; i++) { - ShaderClosure *sc = &sd->closure[i]; - - if(!CLOSURE_IS_BSSRDF(sc->type)) - continue; - - /* set up random number generator */ - uint lcg_state = lcg_state_init(rng, state, 0x68bc21eb); - int num_samples = kernel_data.integrator.subsurface_samples; - float num_samples_inv = 1.0f/num_samples; - RNG bssrdf_rng = cmj_hash(*rng, i); - - state->flag |= PATH_RAY_BSSRDF_ANCESTOR; - - /* 
do subsurface scatter step with copy of shader data, this will - * replace the BSSRDF with a diffuse BSDF closure */ - for(int j = 0; j < num_samples; j++) { - ShaderData bssrdf_sd[BSSRDF_MAX_HITS]; - float bssrdf_u, bssrdf_v; - path_branched_rng_2D(kg, &bssrdf_rng, state, j, num_samples, PRNG_BSDF_U, &bssrdf_u, &bssrdf_v); - int num_hits = subsurface_scatter_multi_step(kg, sd, bssrdf_sd, state->flag, sc, &lcg_state, bssrdf_u, bssrdf_v, true); -#ifdef __VOLUME__ - Ray volume_ray = *ray; - bool need_update_volume_stack = kernel_data.integrator.use_volumes && - sd->flag & SD_OBJECT_INTERSECTS_VOLUME; -#endif - - /* compute lighting with the BSDF closure */ - for(int hit = 0; hit < num_hits; hit++) { - PathState hit_state = *state; - - path_state_branch(&hit_state, j, num_samples); - -#ifdef __VOLUME__ - if(need_update_volume_stack) { - /* Setup ray from previous surface point to the new one. */ - float3 P = ray_offset(bssrdf_sd[hit].P, -bssrdf_sd[hit].Ng); - volume_ray.D = normalize_len(P - volume_ray.P, - &volume_ray.t); - - kernel_path_subsurface_update_volume_stack( - kg, - &volume_ray, - hit_state.volume_stack); - - /* Move volume ray forward. 
*/ - volume_ray.P = P; - } -#endif - -#if defined(__EMISSION__) && defined(__BRANCHED_PATH__) - /* direct light */ - if(kernel_data.integrator.use_direct_light) { - bool all = kernel_data.integrator.sample_all_lights_direct; - kernel_branched_path_surface_connect_light(kg, rng, - &bssrdf_sd[hit], &hit_state, throughput, num_samples_inv, L, all); - } -#endif - - /* indirect light */ - kernel_branched_path_surface_indirect_light(kg, rng, - &bssrdf_sd[hit], throughput, num_samples_inv, - &hit_state, L); - } - } - - state->flag &= ~PATH_RAY_BSSRDF_ANCESTOR; - } -} -#endif - -ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, int sample, Ray ray, ccl_global float *buffer) -{ - /* initialize */ - PathRadiance L; - float3 throughput = make_float3(1.0f, 1.0f, 1.0f); - float L_transparent = 0.0f; - - path_radiance_init(&L, kernel_data.film.use_light_pass); - - PathState state; - path_state_init(kg, &state, rng, sample, &ray); - -#ifdef __KERNEL_DEBUG__ - DebugData debug_data; - debug_data_init(&debug_data); -#endif - - for(;;) { - /* intersect scene */ - Intersection isect; - uint visibility = path_state_ray_visibility(kg, &state); - -#ifdef __HAIR__ - float difl = 0.0f, extmax = 0.0f; - uint lcg_state = 0; - - if(kernel_data.bvh.have_curves) { - if((kernel_data.cam.resolution == 1) && (state.flag & PATH_RAY_CAMERA)) { - float3 pixdiff = ray.dD.dx + ray.dD.dy; - /*pixdiff = pixdiff - dot(pixdiff, ray.D)*ray.D;*/ - difl = kernel_data.curve.minimum_width * len(pixdiff) * 0.5f; - } - - extmax = kernel_data.curve.maximum_width; - lcg_state = lcg_state_init(rng, &state, 0x51633e2d); - } - - bool hit = scene_intersect(kg, &ray, visibility, &isect, &lcg_state, difl, extmax); -#else - bool hit = scene_intersect(kg, &ray, visibility, &isect, NULL, 0.0f, 0.0f); -#endif - -#ifdef __KERNEL_DEBUG__ - if(state.flag & PATH_RAY_CAMERA) { - debug_data.num_bvh_traversal_steps += isect.num_traversal_steps; - } -#endif - -#ifdef __VOLUME__ - /* volume attenuation, 
emission, scatter */ - if(state.volume_stack[0].shader != SHADER_NONE) { - Ray volume_ray = ray; - volume_ray.t = (hit)? isect.t: FLT_MAX; - - bool heterogeneous = volume_stack_is_heterogeneous(kg, state.volume_stack); - -#ifdef __VOLUME_DECOUPLED__ - /* decoupled ray marching only supported on CPU */ - - /* cache steps along volume for repeated sampling */ - VolumeSegment volume_segment; - ShaderData volume_sd; - - shader_setup_from_volume(kg, &volume_sd, &volume_ray, state.bounce, state.transparent_bounce); - kernel_volume_decoupled_record(kg, &state, - &volume_ray, &volume_sd, &volume_segment, heterogeneous); - - /* direct light sampling */ - if(volume_segment.closure_flag & SD_SCATTER) { - volume_segment.sampling_method = volume_stack_sampling_method(kg, state.volume_stack); - - bool all = kernel_data.integrator.sample_all_lights_direct; - - kernel_branched_path_volume_connect_light(kg, rng, &volume_sd, - throughput, &state, &L, all, &volume_ray, &volume_segment); - - /* indirect light sampling */ - int num_samples = kernel_data.integrator.volume_samples; - float num_samples_inv = 1.0f/num_samples; - - for(int j = 0; j < num_samples; j++) { - /* workaround to fix correlation bug in T38710, can find better solution - * in random number generator later, for now this is done here to not impact - * performance of rendering without volumes */ - RNG tmp_rng = cmj_hash(*rng, state.rng_offset); - - PathState ps = state; - Ray pray = ray; - float3 tp = throughput; - - /* branch RNG state */ - path_state_branch(&ps, j, num_samples); - - /* scatter sample. 
if we use distance sampling and take just one - * sample for direct and indirect light, we could share this - * computation, but makes code a bit complex */ - float rphase = path_state_rng_1D_for_decision(kg, &tmp_rng, &ps, PRNG_PHASE); - float rscatter = path_state_rng_1D_for_decision(kg, &tmp_rng, &ps, PRNG_SCATTER_DISTANCE); - - VolumeIntegrateResult result = kernel_volume_decoupled_scatter(kg, - &ps, &pray, &volume_sd, &tp, rphase, rscatter, &volume_segment, NULL, false); - - (void)result; - kernel_assert(result == VOLUME_PATH_SCATTERED); - - if(kernel_path_volume_bounce(kg, rng, &volume_sd, &tp, &ps, &L, &pray)) { - kernel_path_indirect(kg, rng, pray, tp*num_samples_inv, num_samples, ps, &L); - - /* for render passes, sum and reset indirect light pass variables - * for the next samples */ - path_radiance_sum_indirect(&L); - path_radiance_reset_indirect(&L); - } - } - } - - /* emission and transmittance */ - if(volume_segment.closure_flag & SD_EMISSION) - path_radiance_accum_emission(&L, throughput, volume_segment.accum_emission, state.bounce); - throughput *= volume_segment.accum_transmittance; - - /* free cached steps */ - kernel_volume_decoupled_free(kg, &volume_segment); -#else - /* GPU: no decoupled ray marching, scatter probalistically */ - int num_samples = kernel_data.integrator.volume_samples; - float num_samples_inv = 1.0f/num_samples; - - /* todo: we should cache the shader evaluations from stepping - * through the volume, for now we redo them multiple times */ - - for(int j = 0; j < num_samples; j++) { - PathState ps = state; - Ray pray = ray; - ShaderData volume_sd; - float3 tp = throughput * num_samples_inv; - - /* branch RNG state */ - path_state_branch(&ps, j, num_samples); - - VolumeIntegrateResult result = kernel_volume_integrate( - kg, &ps, &volume_sd, &volume_ray, &L, &tp, rng, heterogeneous); - -#ifdef __VOLUME_SCATTER__ - if(result == VOLUME_PATH_SCATTERED) { - /* todo: support equiangular, MIS and all light sampling. 
- * alternatively get decoupled ray marching working on the GPU */ - kernel_path_volume_connect_light(kg, rng, &volume_sd, tp, &state, &L); - - if(kernel_path_volume_bounce(kg, rng, &volume_sd, &tp, &ps, &L, &pray)) { - kernel_path_indirect(kg, rng, pray, tp, num_samples, ps, &L); - - /* for render passes, sum and reset indirect light pass variables - * for the next samples */ - path_radiance_sum_indirect(&L); - path_radiance_reset_indirect(&L); - } - } -#endif - } - - /* todo: avoid this calculation using decoupled ray marching */ - kernel_volume_shadow(kg, &state, &volume_ray, &throughput); -#endif - } -#endif - - if(!hit) { - /* eval background shader if nothing hit */ - if(kernel_data.background.transparent) { - L_transparent += average(throughput); - -#ifdef __PASSES__ - if(!(kernel_data.film.pass_flag & PASS_BACKGROUND)) -#endif - break; - } - -#ifdef __BACKGROUND__ - /* sample background shader */ - float3 L_background = indirect_background(kg, &state, &ray); - path_radiance_accum_background(&L, throughput, L_background, state.bounce); -#endif - - break; - } - - /* setup shading */ - ShaderData sd; - shader_setup_from_ray(kg, &sd, &isect, &ray, state.bounce, state.transparent_bounce); - shader_eval_surface(kg, &sd, 0.0f, state.flag, SHADER_CONTEXT_MAIN); - shader_merge_closures(&sd); - - /* holdout */ -#ifdef __HOLDOUT__ - if(sd.flag & (SD_HOLDOUT|SD_HOLDOUT_MASK)) { - if(kernel_data.background.transparent) { - float3 holdout_weight; - - if(sd.flag & SD_HOLDOUT_MASK) - holdout_weight = make_float3(1.0f, 1.0f, 1.0f); - else - holdout_weight = shader_holdout_eval(kg, &sd); - - /* any throughput is ok, should all be identical here */ - L_transparent += average(holdout_weight*throughput); - } - - if(sd.flag & SD_HOLDOUT_MASK) - break; - } -#endif - - /* holdout mask objects do not write data passes */ - kernel_write_data_passes(kg, buffer, &L, &sd, sample, &state, throughput); - -#ifdef __EMISSION__ - /* emission */ - if(sd.flag & SD_EMISSION) { - float3 
emission = indirect_primitive_emission(kg, &sd, isect.t, state.flag, state.ray_pdf); - path_radiance_accum_emission(&L, throughput, emission, state.bounce); - } -#endif - - /* transparency termination */ - if(state.flag & PATH_RAY_TRANSPARENT) { - /* path termination. this is a strange place to put the termination, it's - * mainly due to the mixed in MIS that we use. gives too many unneeded - * shader evaluations, only need emission if we are going to terminate */ - float probability = path_state_terminate_probability(kg, &state, throughput); - - if(probability == 0.0f) { - break; - } - else if(probability != 1.0f) { - float terminate = path_state_rng_1D_for_decision(kg, rng, &state, PRNG_TERMINATE); - - if(terminate >= probability) - break; - - throughput /= probability; - } - } - -#ifdef __AO__ - /* ambient occlusion */ - if(kernel_data.integrator.use_ambient_occlusion || (sd.flag & SD_AO)) { - kernel_branched_path_ao(kg, &sd, &L, &state, rng, throughput); - } -#endif - -#ifdef __SUBSURFACE__ - /* bssrdf scatter to a different location on the same object */ - if(sd.flag & SD_BSSRDF) { - kernel_branched_path_subsurface_scatter(kg, &sd, &L, &state, - rng, &ray, throughput); - } -#endif - - if(!(sd.flag & SD_HAS_ONLY_VOLUME)) { - PathState hit_state = state; - -#ifdef __EMISSION__ - /* direct light */ - if(kernel_data.integrator.use_direct_light) { - bool all = kernel_data.integrator.sample_all_lights_direct; - kernel_branched_path_surface_connect_light(kg, rng, - &sd, &hit_state, throughput, 1.0f, &L, all); - } -#endif - - /* indirect light */ - kernel_branched_path_surface_indirect_light(kg, rng, - &sd, throughput, 1.0f, &hit_state, &L); - - /* continue in case of transparency */ - throughput *= shader_bsdf_transparency(kg, &sd); - - if(is_zero(throughput)) - break; - } - - path_state_next(kg, &state, LABEL_TRANSPARENT); - ray.P = ray_offset(sd.P, -sd.Ng); - ray.t -= sd.ray_length; /* clipping works through transparent */ - - -#ifdef __RAY_DIFFERENTIALS__ - ray.dP 
= sd.dP; - ray.dD.dx = -sd.dI.dx; - ray.dD.dy = -sd.dI.dy; -#endif - -#ifdef __VOLUME__ - /* enter/exit volume */ - kernel_volume_stack_enter_exit(kg, &sd, state.volume_stack); -#endif - } - - float3 L_sum = path_radiance_clamp_and_sum(kg, &L); - - kernel_write_light_passes(kg, buffer, &L, sample); - -#ifdef __KERNEL_DEBUG__ - kernel_write_debug_passes(kg, buffer, &state, &debug_data, sample); -#endif - - return make_float4(L_sum.x, L_sum.y, L_sum.z, 1.0f - L_transparent); -} - -#endif - -ccl_device_inline void kernel_path_trace_setup(KernelGlobals *kg, ccl_global uint *rng_state, int sample, int x, int y, RNG *rng, Ray *ray) -{ - float filter_u; - float filter_v; - - int num_samples = kernel_data.integrator.aa_samples; - - path_rng_init(kg, rng_state, sample, num_samples, rng, x, y, &filter_u, &filter_v); - - /* sample camera ray */ - - float lens_u = 0.0f, lens_v = 0.0f; - - if(kernel_data.cam.aperturesize > 0.0f) - path_rng_2D(kg, rng, sample, num_samples, PRNG_LENS_U, &lens_u, &lens_v); - - float time = 0.0f; - -#ifdef __CAMERA_MOTION__ - if(kernel_data.cam.shuttertime != -1.0f) - time = path_rng_1D(kg, rng, sample, num_samples, PRNG_TIME); -#endif - - camera_sample(kg, x, y, filter_u, filter_v, lens_u, lens_v, time, ray); -} - ccl_device void kernel_path_trace(KernelGlobals *kg, ccl_global float *buffer, ccl_global uint *rng_state, int sample, int x, int y, int offset, int stride) @@ -1232,38 +702,5 @@ ccl_device void kernel_path_trace(KernelGlobals *kg, path_rng_end(kg, rng_state, rng); } -#ifdef __BRANCHED_PATH__ -ccl_device void kernel_branched_path_trace(KernelGlobals *kg, - ccl_global float *buffer, ccl_global uint *rng_state, - int sample, int x, int y, int offset, int stride) -{ - /* buffer offset */ - int index = offset + x + y*stride; - int pass_stride = kernel_data.film.pass_stride; - - rng_state += index; - buffer += index*pass_stride; - - /* initialize random numbers and ray */ - RNG rng; - Ray ray; - - kernel_path_trace_setup(kg, rng_state, 
sample, x, y, &rng, &ray); - - /* integrate */ - float4 L; - - if(ray.t != 0.0f) - L = kernel_branched_path_integrate(kg, &rng, sample, ray, buffer); - else - L = make_float4(0.0f, 0.0f, 0.0f, 0.0f); - - /* accumulate result in output buffer */ - kernel_write_pass_float4(buffer, sample, L); - - path_rng_end(kg, rng_state, rng); -} -#endif - CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernel_path_branched.h b/intern/cycles/kernel/kernel_path_branched.h new file mode 100644 index 00000000000..b6d64985f6a --- /dev/null +++ b/intern/cycles/kernel/kernel_path_branched.h @@ -0,0 +1,534 @@ +/* + * Copyright 2011-2013 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +CCL_NAMESPACE_BEGIN + +#ifdef __BRANCHED_PATH__ + +ccl_device void kernel_branched_path_ao(KernelGlobals *kg, ShaderData *sd, PathRadiance *L, PathState *state, RNG *rng, float3 throughput) +{ + int num_samples = kernel_data.integrator.ao_samples; + float num_samples_inv = 1.0f/num_samples; + float ao_factor = kernel_data.background.ao_factor; + float3 ao_N; + float3 ao_bsdf = shader_bsdf_ao(kg, sd, ao_factor, &ao_N); + float3 ao_alpha = shader_bsdf_alpha(kg, sd); + + for(int j = 0; j < num_samples; j++) { + float bsdf_u, bsdf_v; + path_branched_rng_2D(kg, rng, state, j, num_samples, PRNG_BSDF_U, &bsdf_u, &bsdf_v); + + float3 ao_D; + float ao_pdf; + + sample_cos_hemisphere(ao_N, bsdf_u, bsdf_v, &ao_D, &ao_pdf); + + if(dot(ccl_fetch(sd, Ng), ao_D) > 0.0f && ao_pdf != 0.0f) { + Ray light_ray; + float3 ao_shadow; + + light_ray.P = ray_offset(ccl_fetch(sd, P), ccl_fetch(sd, Ng)); + light_ray.D = ao_D; + light_ray.t = kernel_data.background.ao_distance; +#ifdef __OBJECT_MOTION__ + light_ray.time = ccl_fetch(sd, time); +#endif + light_ray.dP = ccl_fetch(sd, dP); + light_ray.dD = differential3_zero(); + + if(!shadow_blocked(kg, state, &light_ray, &ao_shadow)) + path_radiance_accum_ao(L, throughput*num_samples_inv, ao_alpha, ao_bsdf, ao_shadow, state->bounce); + } + } +} + + +/* bounce off surface and integrate indirect light */ +ccl_device_noinline void kernel_branched_path_surface_indirect_light(KernelGlobals *kg, + RNG *rng, ShaderData *sd, float3 throughput, float num_samples_adjust, + PathState *state, PathRadiance *L) +{ + for(int i = 0; i < ccl_fetch(sd, num_closure); i++) { + const ShaderClosure *sc = &ccl_fetch(sd, closure)[i]; + + if(!CLOSURE_IS_BSDF(sc->type)) + continue; + /* transparency is not handled here, but in outer loop */ + if(sc->type == CLOSURE_BSDF_TRANSPARENT_ID) + continue; + + int num_samples; + + if(CLOSURE_IS_BSDF_DIFFUSE(sc->type)) + num_samples = kernel_data.integrator.diffuse_samples; + else if(CLOSURE_IS_BSDF_BSSRDF(sc->type)) + 
num_samples = 1; + else if(CLOSURE_IS_BSDF_GLOSSY(sc->type)) + num_samples = kernel_data.integrator.glossy_samples; + else + num_samples = kernel_data.integrator.transmission_samples; + + num_samples = ceil_to_int(num_samples_adjust*num_samples); + + float num_samples_inv = num_samples_adjust/num_samples; + RNG bsdf_rng = cmj_hash(*rng, i); + + for(int j = 0; j < num_samples; j++) { + PathState ps = *state; + float3 tp = throughput; + Ray bsdf_ray; + + if(!kernel_branched_path_surface_bounce(kg, &bsdf_rng, sd, sc, j, num_samples, &tp, &ps, L, &bsdf_ray)) + continue; + + kernel_path_indirect(kg, rng, bsdf_ray, tp*num_samples_inv, num_samples, ps, L); + + /* for render passes, sum and reset indirect light pass variables + * for the next samples */ + path_radiance_sum_indirect(L); + path_radiance_reset_indirect(L); + } + } +} + +#ifdef __SUBSURFACE__ +ccl_device void kernel_branched_path_subsurface_scatter(KernelGlobals *kg, + ShaderData *sd, + PathRadiance *L, + PathState *state, + RNG *rng, + Ray *ray, + float3 throughput) +{ + for(int i = 0; i < ccl_fetch(sd, num_closure); i++) { + ShaderClosure *sc = &ccl_fetch(sd, closure)[i]; + + if(!CLOSURE_IS_BSSRDF(sc->type)) + continue; + + /* set up random number generator */ + uint lcg_state = lcg_state_init(rng, state, 0x68bc21eb); + int num_samples = kernel_data.integrator.subsurface_samples; + float num_samples_inv = 1.0f/num_samples; + RNG bssrdf_rng = cmj_hash(*rng, i); + + /* do subsurface scatter step with copy of shader data, this will + * replace the BSSRDF with a diffuse BSDF closure */ + for(int j = 0; j < num_samples; j++) { + ShaderData bssrdf_sd[BSSRDF_MAX_HITS]; + float bssrdf_u, bssrdf_v; + path_branched_rng_2D(kg, &bssrdf_rng, state, j, num_samples, PRNG_BSDF_U, &bssrdf_u, &bssrdf_v); + int num_hits = subsurface_scatter_multi_step(kg, sd, bssrdf_sd, state->flag, sc, &lcg_state, bssrdf_u, bssrdf_v, true); +#ifdef __VOLUME__ + Ray volume_ray = *ray; + bool need_update_volume_stack = 
kernel_data.integrator.use_volumes && + ccl_fetch(sd, flag) & SD_OBJECT_INTERSECTS_VOLUME; +#endif + + /* compute lighting with the BSDF closure */ + for(int hit = 0; hit < num_hits; hit++) { + PathState hit_state = *state; + + path_state_branch(&hit_state, j, num_samples); + +#ifdef __VOLUME__ + if(need_update_volume_stack) { + /* Setup ray from previous surface point to the new one. */ + float3 P = ray_offset(bssrdf_sd[hit].P, -bssrdf_sd[hit].Ng); + volume_ray.D = normalize_len(P - volume_ray.P, + &volume_ray.t); + + kernel_volume_stack_update_for_subsurface( + kg, + &volume_ray, + hit_state.volume_stack); + + /* Move volume ray forward. */ + volume_ray.P = P; + } +#endif + +#ifdef __EMISSION__ + /* direct light */ + if(kernel_data.integrator.use_direct_light) { + bool all = kernel_data.integrator.sample_all_lights_direct; + kernel_branched_path_surface_connect_light(kg, rng, + &bssrdf_sd[hit], &hit_state, throughput, num_samples_inv, L, all); + } +#endif + + /* indirect light */ + kernel_branched_path_surface_indirect_light(kg, rng, + &bssrdf_sd[hit], throughput, num_samples_inv, + &hit_state, L); + } + } + } +} +#endif + +ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, int sample, Ray ray, ccl_global float *buffer) +{ + /* initialize */ + PathRadiance L; + float3 throughput = make_float3(1.0f, 1.0f, 1.0f); + float L_transparent = 0.0f; + + path_radiance_init(&L, kernel_data.film.use_light_pass); + + PathState state; + path_state_init(kg, &state, rng, sample, &ray); + +#ifdef __KERNEL_DEBUG__ + DebugData debug_data; + debug_data_init(&debug_data); +#endif + + /* Main Loop + * Here we only handle transparency intersections from the camera ray. + * Indirect bounces are handled in kernel_branched_path_surface_indirect_light(). 
+ */ + for(;;) { + /* intersect scene */ + Intersection isect; + uint visibility = path_state_ray_visibility(kg, &state); + +#ifdef __HAIR__ + float difl = 0.0f, extmax = 0.0f; + uint lcg_state = 0; + + if(kernel_data.bvh.have_curves) { + if(kernel_data.cam.resolution == 1) { + float3 pixdiff = ray.dD.dx + ray.dD.dy; + /*pixdiff = pixdiff - dot(pixdiff, ray.D)*ray.D;*/ + difl = kernel_data.curve.minimum_width * len(pixdiff) * 0.5f; + } + + extmax = kernel_data.curve.maximum_width; + lcg_state = lcg_state_init(rng, &state, 0x51633e2d); + } + + bool hit = scene_intersect(kg, &ray, visibility, &isect, &lcg_state, difl, extmax); +#else + bool hit = scene_intersect(kg, &ray, visibility, &isect, NULL, 0.0f, 0.0f); +#endif + +#ifdef __KERNEL_DEBUG__ + debug_data.num_bvh_traversal_steps += isect.num_traversal_steps; + debug_data.num_bvh_traversed_instances += isect.num_traversed_instances; + debug_data.num_ray_bounces++; +#endif + +#ifdef __VOLUME__ + /* volume attenuation, emission, scatter */ + if(state.volume_stack[0].shader != SHADER_NONE) { + Ray volume_ray = ray; + volume_ray.t = (hit)? 
isect.t: FLT_MAX; + + bool heterogeneous = volume_stack_is_heterogeneous(kg, state.volume_stack); + +#ifdef __VOLUME_DECOUPLED__ + /* decoupled ray marching only supported on CPU */ + + /* cache steps along volume for repeated sampling */ + VolumeSegment volume_segment; + ShaderData volume_sd; + + shader_setup_from_volume(kg, &volume_sd, &volume_ray, state.bounce, state.transparent_bounce); + kernel_volume_decoupled_record(kg, &state, + &volume_ray, &volume_sd, &volume_segment, heterogeneous); + + /* direct light sampling */ + if(volume_segment.closure_flag & SD_SCATTER) { + volume_segment.sampling_method = volume_stack_sampling_method(kg, state.volume_stack); + + bool all = kernel_data.integrator.sample_all_lights_direct; + + kernel_branched_path_volume_connect_light(kg, rng, &volume_sd, + throughput, &state, &L, all, &volume_ray, &volume_segment); + + /* indirect light sampling */ + int num_samples = kernel_data.integrator.volume_samples; + float num_samples_inv = 1.0f/num_samples; + + for(int j = 0; j < num_samples; j++) { + /* workaround to fix correlation bug in T38710, can find better solution + * in random number generator later, for now this is done here to not impact + * performance of rendering without volumes */ + RNG tmp_rng = cmj_hash(*rng, state.rng_offset); + + PathState ps = state; + Ray pray = ray; + float3 tp = throughput; + + /* branch RNG state */ + path_state_branch(&ps, j, num_samples); + + /* scatter sample. 
if we use distance sampling and take just one + * sample for direct and indirect light, we could share this + * computation, but makes code a bit complex */ + float rphase = path_state_rng_1D_for_decision(kg, &tmp_rng, &ps, PRNG_PHASE); + float rscatter = path_state_rng_1D_for_decision(kg, &tmp_rng, &ps, PRNG_SCATTER_DISTANCE); + + VolumeIntegrateResult result = kernel_volume_decoupled_scatter(kg, + &ps, &pray, &volume_sd, &tp, rphase, rscatter, &volume_segment, NULL, false); + + (void)result; + kernel_assert(result == VOLUME_PATH_SCATTERED); + + if(kernel_path_volume_bounce(kg, rng, &volume_sd, &tp, &ps, &L, &pray)) { + kernel_path_indirect(kg, rng, pray, tp*num_samples_inv, num_samples, ps, &L); + + /* for render passes, sum and reset indirect light pass variables + * for the next samples */ + path_radiance_sum_indirect(&L); + path_radiance_reset_indirect(&L); + } + } + } + + /* emission and transmittance */ + if(volume_segment.closure_flag & SD_EMISSION) + path_radiance_accum_emission(&L, throughput, volume_segment.accum_emission, state.bounce); + throughput *= volume_segment.accum_transmittance; + + /* free cached steps */ + kernel_volume_decoupled_free(kg, &volume_segment); +#else + /* GPU: no decoupled ray marching, scatter probalistically */ + int num_samples = kernel_data.integrator.volume_samples; + float num_samples_inv = 1.0f/num_samples; + + /* todo: we should cache the shader evaluations from stepping + * through the volume, for now we redo them multiple times */ + + for(int j = 0; j < num_samples; j++) { + PathState ps = state; + Ray pray = ray; + ShaderData volume_sd; + float3 tp = throughput * num_samples_inv; + + /* branch RNG state */ + path_state_branch(&ps, j, num_samples); + + VolumeIntegrateResult result = kernel_volume_integrate( + kg, &ps, &volume_sd, &volume_ray, &L, &tp, rng, heterogeneous); + +#ifdef __VOLUME_SCATTER__ + if(result == VOLUME_PATH_SCATTERED) { + /* todo: support equiangular, MIS and all light sampling. 
+ * alternatively get decoupled ray marching working on the GPU */ + kernel_path_volume_connect_light(kg, rng, &volume_sd, tp, &state, &L); + + if(kernel_path_volume_bounce(kg, rng, &volume_sd, &tp, &ps, &L, &pray)) { + kernel_path_indirect(kg, rng, pray, tp, num_samples, ps, &L); + + /* for render passes, sum and reset indirect light pass variables + * for the next samples */ + path_radiance_sum_indirect(&L); + path_radiance_reset_indirect(&L); + } + } +#endif + } + + /* todo: avoid this calculation using decoupled ray marching */ + kernel_volume_shadow(kg, &state, &volume_ray, &throughput); +#endif + } +#endif + + if(!hit) { + /* eval background shader if nothing hit */ + if(kernel_data.background.transparent) { + L_transparent += average(throughput); + +#ifdef __PASSES__ + if(!(kernel_data.film.pass_flag & PASS_BACKGROUND)) +#endif + break; + } + +#ifdef __BACKGROUND__ + /* sample background shader */ + float3 L_background = indirect_background(kg, &state, &ray); + path_radiance_accum_background(&L, throughput, L_background, state.bounce); +#endif + + break; + } + + /* setup shading */ + ShaderData sd; + shader_setup_from_ray(kg, &sd, &isect, &ray, state.bounce, state.transparent_bounce); + shader_eval_surface(kg, &sd, 0.0f, state.flag, SHADER_CONTEXT_MAIN); + shader_merge_closures(&sd); + + /* holdout */ +#ifdef __HOLDOUT__ + if(sd.flag & (SD_HOLDOUT|SD_HOLDOUT_MASK)) { + if(kernel_data.background.transparent) { + float3 holdout_weight; + + if(sd.flag & SD_HOLDOUT_MASK) + holdout_weight = make_float3(1.0f, 1.0f, 1.0f); + else + holdout_weight = shader_holdout_eval(kg, &sd); + + /* any throughput is ok, should all be identical here */ + L_transparent += average(holdout_weight*throughput); + } + + if(sd.flag & SD_HOLDOUT_MASK) + break; + } +#endif + + /* holdout mask objects do not write data passes */ + kernel_write_data_passes(kg, buffer, &L, &sd, sample, &state, throughput); + +#ifdef __EMISSION__ + /* emission */ + if(sd.flag & SD_EMISSION) { + float3 
emission = indirect_primitive_emission(kg, &sd, isect.t, state.flag, state.ray_pdf); + path_radiance_accum_emission(&L, throughput, emission, state.bounce); + } +#endif + + /* transparency termination */ + if(state.flag & PATH_RAY_TRANSPARENT) { + /* path termination. this is a strange place to put the termination, it's + * mainly due to the mixed in MIS that we use. gives too many unneeded + * shader evaluations, only need emission if we are going to terminate */ + float probability = path_state_terminate_probability(kg, &state, throughput); + + if(probability == 0.0f) { + break; + } + else if(probability != 1.0f) { + float terminate = path_state_rng_1D_for_decision(kg, rng, &state, PRNG_TERMINATE); + + if(terminate >= probability) + break; + + throughput /= probability; + } + } + +#ifdef __AO__ + /* ambient occlusion */ + if(kernel_data.integrator.use_ambient_occlusion || (sd.flag & SD_AO)) { + kernel_branched_path_ao(kg, &sd, &L, &state, rng, throughput); + } +#endif + +#ifdef __SUBSURFACE__ + /* bssrdf scatter to a different location on the same object */ + if(sd.flag & SD_BSSRDF) { + kernel_branched_path_subsurface_scatter(kg, &sd, &L, &state, + rng, &ray, throughput); + } +#endif + + if(!(sd.flag & SD_HAS_ONLY_VOLUME)) { + PathState hit_state = state; + +#ifdef __EMISSION__ + /* direct light */ + if(kernel_data.integrator.use_direct_light) { + bool all = kernel_data.integrator.sample_all_lights_direct; + kernel_branched_path_surface_connect_light(kg, rng, + &sd, &hit_state, throughput, 1.0f, &L, all); + } +#endif + + /* indirect light */ + kernel_branched_path_surface_indirect_light(kg, rng, + &sd, throughput, 1.0f, &hit_state, &L); + + /* continue in case of transparency */ + throughput *= shader_bsdf_transparency(kg, &sd); + + if(is_zero(throughput)) + break; + } + + /* Update Path State */ + state.flag |= PATH_RAY_TRANSPARENT; + state.transparent_bounce++; + + ray.P = ray_offset(sd.P, -sd.Ng); + ray.t -= sd.ray_length; /* clipping works through transparent 
*/ + + +#ifdef __RAY_DIFFERENTIALS__ + ray.dP = sd.dP; + ray.dD.dx = -sd.dI.dx; + ray.dD.dy = -sd.dI.dy; +#endif + +#ifdef __VOLUME__ + /* enter/exit volume */ + kernel_volume_stack_enter_exit(kg, &sd, state.volume_stack); +#endif + } + + float3 L_sum = path_radiance_clamp_and_sum(kg, &L); + + kernel_write_light_passes(kg, buffer, &L, sample); + +#ifdef __KERNEL_DEBUG__ + kernel_write_debug_passes(kg, buffer, &state, &debug_data, sample); +#endif + + return make_float4(L_sum.x, L_sum.y, L_sum.z, 1.0f - L_transparent); +} + +ccl_device void kernel_branched_path_trace(KernelGlobals *kg, + ccl_global float *buffer, ccl_global uint *rng_state, + int sample, int x, int y, int offset, int stride) +{ + /* buffer offset */ + int index = offset + x + y*stride; + int pass_stride = kernel_data.film.pass_stride; + + rng_state += index; + buffer += index*pass_stride; + + /* initialize random numbers and ray */ + RNG rng; + Ray ray; + + kernel_path_trace_setup(kg, rng_state, sample, x, y, &rng, &ray); + + /* integrate */ + float4 L; + + if(ray.t != 0.0f) + L = kernel_branched_path_integrate(kg, &rng, sample, ray, buffer); + else + L = make_float4(0.0f, 0.0f, 0.0f, 0.0f); + + /* accumulate result in output buffer */ + kernel_write_pass_float4(buffer, sample, L); + + path_rng_end(kg, rng_state, rng); +} + +#endif /* __BRANCHED_PATH__ */ + +CCL_NAMESPACE_END + diff --git a/intern/cycles/kernel/kernel_path_common.h b/intern/cycles/kernel/kernel_path_common.h new file mode 100644 index 00000000000..1912dfa16ed --- /dev/null +++ b/intern/cycles/kernel/kernel_path_common.h @@ -0,0 +1,50 @@ +/* + * Copyright 2011-2015 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +CCL_NAMESPACE_BEGIN + +ccl_device_inline void kernel_path_trace_setup(KernelGlobals *kg, + ccl_global uint *rng_state, + int sample, + int x, int y, + ccl_addr_space RNG *rng, + ccl_addr_space Ray *ray) +{ + float filter_u; + float filter_v; + + int num_samples = kernel_data.integrator.aa_samples; + + path_rng_init(kg, rng_state, sample, num_samples, rng, x, y, &filter_u, &filter_v); + + /* sample camera ray */ + + float lens_u = 0.0f, lens_v = 0.0f; + + if(kernel_data.cam.aperturesize > 0.0f) + path_rng_2D(kg, rng, sample, num_samples, PRNG_LENS_U, &lens_u, &lens_v); + + float time = 0.0f; + +#ifdef __CAMERA_MOTION__ + if(kernel_data.cam.shuttertime != -1.0f) + time = path_rng_1D(kg, rng, sample, num_samples, PRNG_TIME); +#endif + + camera_sample(kg, x, y, filter_u, filter_v, lens_u, lens_v, time, ray); +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernel_path_state.h b/intern/cycles/kernel/kernel_path_state.h index ab146c72cd0..15efb2371de 100644 --- a/intern/cycles/kernel/kernel_path_state.h +++ b/intern/cycles/kernel/kernel_path_state.h @@ -16,7 +16,7 @@ CCL_NAMESPACE_BEGIN -ccl_device_inline void path_state_init(KernelGlobals *kg, PathState *state, RNG *rng, int sample, Ray *ray) +ccl_device_inline void path_state_init(KernelGlobals *kg, ccl_addr_space PathState *state, ccl_addr_space RNG *rng, int sample, ccl_addr_space Ray *ray) { state->flag = PATH_RAY_CAMERA|PATH_RAY_MIS_SKIP; @@ -51,7 +51,7 @@ ccl_device_inline void path_state_init(KernelGlobals *kg, PathState *state, RNG #endif } -ccl_device_inline void 
path_state_next(KernelGlobals *kg, PathState *state, int label) +ccl_device_inline void path_state_next(KernelGlobals *kg, ccl_addr_space PathState *state, int label) { /* ray through transparent keeps same flags from previous ray and is * not counted as a regular bounce, transparent has separate max */ @@ -106,7 +106,7 @@ ccl_device_inline void path_state_next(KernelGlobals *kg, PathState *state, int state->flag &= ~(PATH_RAY_GLOSSY|PATH_RAY_SINGULAR|PATH_RAY_MIS_SKIP); } else if(label & LABEL_GLOSSY) { - state->flag |= PATH_RAY_GLOSSY|PATH_RAY_GLOSSY_ANCESTOR; + state->flag |= PATH_RAY_GLOSSY; state->flag &= ~(PATH_RAY_DIFFUSE|PATH_RAY_SINGULAR|PATH_RAY_MIS_SKIP); } else { @@ -138,7 +138,7 @@ ccl_device_inline uint path_state_ray_visibility(KernelGlobals *kg, PathState *s return flag; } -ccl_device_inline float path_state_terminate_probability(KernelGlobals *kg, PathState *state, const float3 throughput) +ccl_device_inline float path_state_terminate_probability(KernelGlobals *kg, ccl_addr_space PathState *state, const float3 throughput) { if(state->flag & PATH_RAY_TRANSPARENT) { /* transparent rays treated separately */ diff --git a/intern/cycles/kernel/kernel_path_surface.h b/intern/cycles/kernel/kernel_path_surface.h index f0d4e98c5e0..fe85a6b6e4b 100644 --- a/intern/cycles/kernel/kernel_path_surface.h +++ b/intern/cycles/kernel/kernel_path_surface.h @@ -24,7 +24,7 @@ ccl_device void kernel_branched_path_surface_connect_light(KernelGlobals *kg, RN { #ifdef __EMISSION__ /* sample illumination from lights to find path contribution */ - if(!(sd->flag & SD_BSDF_HAS_EVAL)) + if(!(ccl_fetch(sd, flag) & SD_BSDF_HAS_EVAL)) return; Ray light_ray; @@ -32,7 +32,7 @@ ccl_device void kernel_branched_path_surface_connect_light(KernelGlobals *kg, RN bool is_lamp; #ifdef __OBJECT_MOTION__ - light_ray.time = sd->time; + light_ray.time = ccl_fetch(sd, time); #endif if(sample_all_lights) { @@ -53,7 +53,7 @@ ccl_device void kernel_branched_path_surface_connect_light(KernelGlobals 
*kg, RN path_branched_rng_2D(kg, &lamp_rng, state, j, num_samples, PRNG_LIGHT_U, &light_u, &light_v); LightSample ls; - lamp_light_sample(kg, i, light_u, light_v, sd->P, &ls); + lamp_light_sample(kg, i, light_u, light_v, ccl_fetch(sd, P), &ls); if(direct_emission(kg, sd, &ls, &light_ray, &L_light, &is_lamp, state->bounce, state->transparent_bounce)) { /* trace shadow ray */ @@ -85,7 +85,7 @@ ccl_device void kernel_branched_path_surface_connect_light(KernelGlobals *kg, RN light_t = 0.5f*light_t; LightSample ls; - light_sample(kg, light_t, light_u, light_v, sd->time, sd->P, state->bounce, &ls); + light_sample(kg, light_t, light_u, light_v, ccl_fetch(sd, time), ccl_fetch(sd, P), state->bounce, &ls); if(direct_emission(kg, sd, &ls, &light_ray, &L_light, &is_lamp, state->bounce, state->transparent_bounce)) { /* trace shadow ray */ @@ -106,7 +106,7 @@ ccl_device void kernel_branched_path_surface_connect_light(KernelGlobals *kg, RN path_state_rng_2D(kg, rng, state, PRNG_LIGHT_U, &light_u, &light_v); LightSample ls; - light_sample(kg, light_t, light_u, light_v, sd->time, sd->P, state->bounce, &ls); + light_sample(kg, light_t, light_u, light_v, ccl_fetch(sd, time), ccl_fetch(sd, P), state->bounce, &ls); /* sample random light */ if(direct_emission(kg, sd, &ls, &light_ray, &L_light, &is_lamp, state->bounce, state->transparent_bounce)) { @@ -149,15 +149,15 @@ ccl_device bool kernel_branched_path_surface_bounce(KernelGlobals *kg, RNG *rng, path_state_next(kg, state, label); /* setup ray */ - ray->P = ray_offset(sd->P, (label & LABEL_TRANSMIT)? -sd->Ng: sd->Ng); + ray->P = ray_offset(ccl_fetch(sd, P), (label & LABEL_TRANSMIT)? 
-ccl_fetch(sd, Ng): ccl_fetch(sd, Ng)); ray->D = bsdf_omega_in; ray->t = FLT_MAX; #ifdef __RAY_DIFFERENTIALS__ - ray->dP = sd->dP; + ray->dP = ccl_fetch(sd, dP); ray->dD = bsdf_domega_in; #endif #ifdef __OBJECT_MOTION__ - ray->time = sd->time; + ray->time = ccl_fetch(sd, time); #endif #ifdef __VOLUME__ @@ -181,12 +181,13 @@ ccl_device bool kernel_branched_path_surface_bounce(KernelGlobals *kg, RNG *rng, #endif +#ifndef __SPLIT_KERNEL__ /* path tracing: connect path directly to position on a light and add it to L */ -ccl_device_inline void kernel_path_surface_connect_light(KernelGlobals *kg, RNG *rng, - ShaderData *sd, float3 throughput, PathState *state, PathRadiance *L) +ccl_device_inline void kernel_path_surface_connect_light(KernelGlobals *kg, ccl_addr_space RNG *rng, + ShaderData *sd, float3 throughput, ccl_addr_space PathState *state, PathRadiance *L) { #ifdef __EMISSION__ - if(!(kernel_data.integrator.use_direct_light && (sd->flag & SD_BSDF_HAS_EVAL))) + if(!(kernel_data.integrator.use_direct_light && (ccl_fetch(sd, flag) & SD_BSDF_HAS_EVAL))) return; /* sample illumination from lights to find path contribution */ @@ -199,11 +200,11 @@ ccl_device_inline void kernel_path_surface_connect_light(KernelGlobals *kg, RNG bool is_lamp; #ifdef __OBJECT_MOTION__ - light_ray.time = sd->time; + light_ray.time = ccl_fetch(sd, time); #endif LightSample ls; - light_sample(kg, light_t, light_u, light_v, sd->time, sd->P, state->bounce, &ls); + light_sample(kg, light_t, light_u, light_v, ccl_fetch(sd, time), ccl_fetch(sd, P), state->bounce, &ls); if(direct_emission(kg, sd, &ls, &light_ray, &L_light, &is_lamp, state->bounce, state->transparent_bounce)) { /* trace shadow ray */ @@ -216,13 +217,14 @@ ccl_device_inline void kernel_path_surface_connect_light(KernelGlobals *kg, RNG } #endif } +#endif /* path tracing: bounce off or through surface to with new direction stored in ray */ -ccl_device_inline bool kernel_path_surface_bounce(KernelGlobals *kg, RNG *rng, - ShaderData *sd, 
float3 *throughput, PathState *state, PathRadiance *L, Ray *ray) +ccl_device_inline bool kernel_path_surface_bounce(KernelGlobals *kg, ccl_addr_space RNG *rng, + ShaderData *sd, ccl_addr_space float3 *throughput, ccl_addr_space PathState *state, PathRadiance *L, ccl_addr_space Ray *ray) { /* no BSDF? we can stop here */ - if(sd->flag & SD_BSDF) { + if(ccl_fetch(sd, flag) & SD_BSDF) { /* sample BSDF */ float bsdf_pdf; BsdfEval bsdf_eval; @@ -254,16 +256,16 @@ ccl_device_inline bool kernel_path_surface_bounce(KernelGlobals *kg, RNG *rng, path_state_next(kg, state, label); /* setup ray */ - ray->P = ray_offset(sd->P, (label & LABEL_TRANSMIT)? -sd->Ng: sd->Ng); + ray->P = ray_offset(ccl_fetch(sd, P), (label & LABEL_TRANSMIT)? -ccl_fetch(sd, Ng): ccl_fetch(sd, Ng)); ray->D = bsdf_omega_in; if(state->bounce == 0) - ray->t -= sd->ray_length; /* clipping works through transparent */ + ray->t -= ccl_fetch(sd, ray_length); /* clipping works through transparent */ else ray->t = FLT_MAX; #ifdef __RAY_DIFFERENTIALS__ - ray->dP = sd->dP; + ray->dP = ccl_fetch(sd, dP); ray->dD = bsdf_domega_in; #endif @@ -275,21 +277,21 @@ ccl_device_inline bool kernel_path_surface_bounce(KernelGlobals *kg, RNG *rng, return true; } #ifdef __VOLUME__ - else if(sd->flag & SD_HAS_ONLY_VOLUME) { + else if(ccl_fetch(sd, flag) & SD_HAS_ONLY_VOLUME) { /* no surface shader but have a volume shader? 
act transparent */ /* update path state, count as transparent */ path_state_next(kg, state, LABEL_TRANSPARENT); if(state->bounce == 0) - ray->t -= sd->ray_length; /* clipping works through transparent */ + ray->t -= ccl_fetch(sd, ray_length); /* clipping works through transparent */ else ray->t = FLT_MAX; /* setup ray position, direction stays unchanged */ - ray->P = ray_offset(sd->P, -sd->Ng); + ray->P = ray_offset(ccl_fetch(sd, P), -ccl_fetch(sd, Ng)); #ifdef __RAY_DIFFERENTIALS__ - ray->dP = sd->dP; + ray->dP = ccl_fetch(sd, dP); #endif /* enter/exit volume */ diff --git a/intern/cycles/kernel/kernel_projection.h b/intern/cycles/kernel/kernel_projection.h index bd18fd21354..62922df3286 100644 --- a/intern/cycles/kernel/kernel_projection.h +++ b/intern/cycles/kernel/kernel_projection.h @@ -163,6 +163,10 @@ ccl_device float3 mirrorball_to_direction(float u, float v) dir.x = 2.0f*u - 1.0f; dir.z = 2.0f*v - 1.0f; + + if(dir.x*dir.x + dir.z*dir.z > 1.0f) + return make_float3(0.0f, 0.0f, 0.0f); + dir.y = -sqrtf(max(1.0f - dir.x*dir.x - dir.z*dir.z, 0.0f)); /* reflection */ @@ -191,6 +195,8 @@ ccl_device float3 panorama_to_direction(KernelGlobals *kg, float u, float v) switch(kernel_data.cam.panorama_type) { case PANORAMA_EQUIRECTANGULAR: return equirectangular_range_to_direction(u, v, kernel_data.cam.equirectangular_range); + case PANORAMA_MIRRORBALL: + return mirrorball_to_direction(u, v); case PANORAMA_FISHEYE_EQUIDISTANT: return fisheye_to_direction(u, v, kernel_data.cam.fisheye_fov); case PANORAMA_FISHEYE_EQUISOLID: @@ -205,6 +211,8 @@ ccl_device float2 direction_to_panorama(KernelGlobals *kg, float3 dir) switch(kernel_data.cam.panorama_type) { case PANORAMA_EQUIRECTANGULAR: return direction_to_equirectangular_range(dir, kernel_data.cam.equirectangular_range); + case PANORAMA_MIRRORBALL: + return direction_to_mirrorball(dir); case PANORAMA_FISHEYE_EQUIDISTANT: return direction_to_fisheye(dir, kernel_data.cam.fisheye_fov); case PANORAMA_FISHEYE_EQUISOLID: diff 
--git a/intern/cycles/kernel/kernel_queues.h b/intern/cycles/kernel/kernel_queues.h new file mode 100644 index 00000000000..9e65e2b0768 --- /dev/null +++ b/intern/cycles/kernel/kernel_queues.h @@ -0,0 +1,132 @@ +/* + * Copyright 2011-2015 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __KERNEL_QUEUE_H__ +#define __KERNEL_QUEUE_H__ + +/* + * Queue utility functions for split kernel + */ + +#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable +#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable + +/* + * Enqueue ray index into the queue + */ +ccl_device void enqueue_ray_index ( + int ray_index, /* Ray index to be enqueued */ + int queue_number, /* Queue in which the ray index should be enqueued*/ + ccl_global int *queues, /* Buffer of all queues */ + int queue_size, /* Size of each queue */ + ccl_global int *queue_index /* Array of size num_queues; Used for atomic increment */ + ) +{ + /* This thread's queue index */ + int my_queue_index = atomic_inc(&queue_index[queue_number]) + (queue_number * queue_size); + queues[my_queue_index] = ray_index; +} + +/* + * Get the ray index for this thread + * Returns a positive ray_index for threads that have to do some work; + * Returns 'QUEUE_EMPTY_SLOT' for threads that don't have any work + * i.e All ray's in the queue has been successfully allocated and there + * is no more ray to allocate to other threads. 
+ */ +ccl_device int get_ray_index ( + int thread_index, /* Global thread index */ + int queue_number, /* Queue to operate on */ + ccl_global int *queues, /* Buffer of all queues */ + int queuesize, /* Size of a queue */ + int empty_queue /* Empty the queue slot as soon as we fetch the ray index */ + ) +{ + int ray_index = queues[queue_number * queuesize + thread_index]; + + if(empty_queue && ray_index != QUEUE_EMPTY_SLOT) { + queues[queue_number * queuesize + thread_index] = QUEUE_EMPTY_SLOT; + } + + return ray_index; +} + +/* The following functions are to realize Local memory variant of enqueue ray index function */ + +/* All threads should call this function */ +ccl_device void enqueue_ray_index_local( + int ray_index, /* Ray index to enqueue*/ + int queue_number, /* Queue in which to enqueue ray index */ + char enqueue_flag, /* True for threads whose ray index has to be enqueued */ + int queuesize, /* queue size */ + ccl_local unsigned int *local_queue_atomics, /* To to local queue atomics */ + ccl_global int *Queue_data, /* Queues */ + ccl_global int *Queue_index /* To do global queue atomics */ + ) +{ + int lidx = get_local_id(1) * get_local_size(0) + get_local_id(0); + + /* Get local queue id */ + unsigned int lqidx; + if(enqueue_flag) { + lqidx = atomic_inc(local_queue_atomics); + } + barrier(CLK_LOCAL_MEM_FENCE); + + /* Get global queue offset */ + if(lidx == 0) { + *local_queue_atomics = atomic_add(&Queue_index[queue_number], *local_queue_atomics); + } + barrier(CLK_LOCAL_MEM_FENCE); + + /* Get global queue index and enqueue ray */ + if(enqueue_flag) { + unsigned int my_gqidx = queue_number * queuesize + (*local_queue_atomics) + lqidx; + Queue_data[my_gqidx] = ray_index; + } +} + +ccl_device unsigned int get_local_queue_index( + int queue_number, /* Queue in which to enqueue the ray; -1 if no queue */ + ccl_local unsigned int *local_queue_atomics + ) +{ + int my_lqidx = atomic_inc(&local_queue_atomics[queue_number]); + return my_lqidx; +} + +ccl_device 
unsigned int get_global_per_queue_offset( + int queue_number, + ccl_local unsigned int *local_queue_atomics, + ccl_global int* global_queue_atomics + ) +{ + unsigned int queue_offset = atomic_add((&global_queue_atomics[queue_number]), local_queue_atomics[queue_number]); + return queue_offset; +} + +ccl_device unsigned int get_global_queue_index( + int queue_number, + int queuesize, + unsigned int lqidx, + ccl_local unsigned int * global_per_queue_offset + ) +{ + int my_gqidx = queuesize * queue_number + lqidx + global_per_queue_offset[queue_number]; + return my_gqidx; +} + +#endif // __KERNEL_QUEUE_H__ diff --git a/intern/cycles/kernel/kernel_random.h b/intern/cycles/kernel/kernel_random.h index 40767bac013..631a2cb75de 100644 --- a/intern/cycles/kernel/kernel_random.h +++ b/intern/cycles/kernel/kernel_random.h @@ -98,7 +98,7 @@ ccl_device uint sobol_lookup(const uint m, const uint frame, const uint ex, cons return index; } -ccl_device_inline float path_rng_1D(KernelGlobals *kg, RNG *rng, int sample, int num_samples, int dimension) +ccl_device_inline float path_rng_1D(KernelGlobals *kg, ccl_addr_space RNG *rng, int sample, int num_samples, int dimension) { #ifdef __CMJ__ if(kernel_data.integrator.sampling_pattern == SAMPLING_PATTERN_CMJ) { @@ -132,7 +132,7 @@ ccl_device_inline float path_rng_1D(KernelGlobals *kg, RNG *rng, int sample, int #endif } -ccl_device_inline void path_rng_2D(KernelGlobals *kg, RNG *rng, int sample, int num_samples, int dimension, float *fx, float *fy) +ccl_device_inline void path_rng_2D(KernelGlobals *kg, ccl_addr_space RNG *rng, int sample, int num_samples, int dimension, float *fx, float *fy) { #ifdef __CMJ__ if(kernel_data.integrator.sampling_pattern == SAMPLING_PATTERN_CMJ) { @@ -149,7 +149,7 @@ ccl_device_inline void path_rng_2D(KernelGlobals *kg, RNG *rng, int sample, int } } -ccl_device_inline void path_rng_init(KernelGlobals *kg, ccl_global uint *rng_state, int sample, int num_samples, RNG *rng, int x, int y, float *fx, float *fy) 
+ccl_device_inline void path_rng_init(KernelGlobals *kg, ccl_global uint *rng_state, int sample, int num_samples, ccl_addr_space RNG *rng, int x, int y, float *fx, float *fy) { #ifdef __SOBOL_FULL_SCREEN__ uint px, py; @@ -261,12 +261,12 @@ ccl_device uint lcg_init(uint seed) * For branches in the path we must be careful not to reuse the same number * in a sequence and offset accordingly. */ -ccl_device_inline float path_state_rng_1D(KernelGlobals *kg, RNG *rng, const PathState *state, int dimension) +ccl_device_inline float path_state_rng_1D(KernelGlobals *kg, ccl_addr_space RNG *rng, const ccl_addr_space PathState *state, int dimension) { return path_rng_1D(kg, rng, state->sample, state->num_samples, state->rng_offset + dimension); } -ccl_device_inline float path_state_rng_1D_for_decision(KernelGlobals *kg, RNG *rng, const PathState *state, int dimension) +ccl_device_inline float path_state_rng_1D_for_decision(KernelGlobals *kg, ccl_addr_space RNG *rng, const ccl_addr_space PathState *state, int dimension) { /* the rng_offset is not increased for transparent bounces. 
if we do then * fully transparent objects can become subtly visible by the different @@ -279,23 +279,23 @@ ccl_device_inline float path_state_rng_1D_for_decision(KernelGlobals *kg, RNG *r return path_rng_1D(kg, rng, state->sample, state->num_samples, rng_offset + dimension); } -ccl_device_inline void path_state_rng_2D(KernelGlobals *kg, RNG *rng, const PathState *state, int dimension, float *fx, float *fy) +ccl_device_inline void path_state_rng_2D(KernelGlobals *kg, ccl_addr_space RNG *rng, const ccl_addr_space PathState *state, int dimension, float *fx, float *fy) { path_rng_2D(kg, rng, state->sample, state->num_samples, state->rng_offset + dimension, fx, fy); } -ccl_device_inline float path_branched_rng_1D(KernelGlobals *kg, RNG *rng, const PathState *state, int branch, int num_branches, int dimension) +ccl_device_inline float path_branched_rng_1D(KernelGlobals *kg, ccl_addr_space RNG *rng, const PathState *state, int branch, int num_branches, int dimension) { return path_rng_1D(kg, rng, state->sample*num_branches + branch, state->num_samples*num_branches, state->rng_offset + dimension); } -ccl_device_inline float path_branched_rng_1D_for_decision(KernelGlobals *kg, RNG *rng, const PathState *state, int branch, int num_branches, int dimension) +ccl_device_inline float path_branched_rng_1D_for_decision(KernelGlobals *kg, ccl_addr_space RNG *rng, const PathState *state, int branch, int num_branches, int dimension) { int rng_offset = state->rng_offset + state->transparent_bounce*PRNG_BOUNCE_NUM; return path_rng_1D(kg, rng, state->sample*num_branches + branch, state->num_samples*num_branches, rng_offset + dimension); } -ccl_device_inline void path_branched_rng_2D(KernelGlobals *kg, RNG *rng, const PathState *state, int branch, int num_branches, int dimension, float *fx, float *fy) +ccl_device_inline void path_branched_rng_2D(KernelGlobals *kg, ccl_addr_space RNG *rng, const PathState *state, int branch, int num_branches, int dimension, float *fx, float *fy) { 
path_rng_2D(kg, rng, state->sample*num_branches + branch, state->num_samples*num_branches, state->rng_offset + dimension, fx, fy); } diff --git a/intern/cycles/kernel/kernel_shader.h b/intern/cycles/kernel/kernel_shader.h index 0f3b09a9555..94e13028599 100644 --- a/intern/cycles/kernel/kernel_shader.h +++ b/intern/cycles/kernel/kernel_shader.h @@ -37,13 +37,13 @@ CCL_NAMESPACE_BEGIN #ifdef __OBJECT_MOTION__ ccl_device void shader_setup_object_transforms(KernelGlobals *kg, ShaderData *sd, float time) { - if(sd->flag & SD_OBJECT_MOTION) { - sd->ob_tfm = object_fetch_transform_motion(kg, sd->object, time); - sd->ob_itfm = transform_quick_inverse(sd->ob_tfm); + if(ccl_fetch(sd, flag) & SD_OBJECT_MOTION) { + ccl_fetch(sd, ob_tfm) = object_fetch_transform_motion(kg, ccl_fetch(sd, object), time); + ccl_fetch(sd, ob_itfm) = transform_quick_inverse(ccl_fetch(sd, ob_tfm)); } else { - sd->ob_tfm = object_fetch_transform(kg, sd->object, OBJECT_TRANSFORM); - sd->ob_itfm = object_fetch_transform(kg, sd->object, OBJECT_INVERSE_TRANSFORM); + ccl_fetch(sd, ob_tfm) = object_fetch_transform(kg, ccl_fetch(sd, object), OBJECT_TRANSFORM); + ccl_fetch(sd, ob_itfm) = object_fetch_transform(kg, ccl_fetch(sd, object), OBJECT_INVERSE_TRANSFORM); } } #endif @@ -52,55 +52,55 @@ ccl_device void shader_setup_from_ray(KernelGlobals *kg, ShaderData *sd, const Intersection *isect, const Ray *ray, int bounce, int transparent_bounce) { #ifdef __INSTANCING__ - sd->object = (isect->object == PRIM_NONE)? kernel_tex_fetch(__prim_object, isect->prim): isect->object; + ccl_fetch(sd, object) = (isect->object == PRIM_NONE)? 
kernel_tex_fetch(__prim_object, isect->prim): isect->object; #endif - sd->type = isect->type; - sd->flag = kernel_tex_fetch(__object_flag, sd->object); + ccl_fetch(sd, type) = isect->type; + ccl_fetch(sd, flag) = kernel_tex_fetch(__object_flag, ccl_fetch(sd, object)); /* matrices and time */ #ifdef __OBJECT_MOTION__ shader_setup_object_transforms(kg, sd, ray->time); - sd->time = ray->time; + ccl_fetch(sd, time) = ray->time; #endif - sd->prim = kernel_tex_fetch(__prim_index, isect->prim); - sd->ray_length = isect->t; - sd->ray_depth = bounce; - sd->transparent_depth = transparent_bounce; + ccl_fetch(sd, prim) = kernel_tex_fetch(__prim_index, isect->prim); + ccl_fetch(sd, ray_length) = isect->t; + ccl_fetch(sd, ray_depth) = bounce; + ccl_fetch(sd, transparent_depth) = transparent_bounce; #ifdef __UV__ - sd->u = isect->u; - sd->v = isect->v; + ccl_fetch(sd, u) = isect->u; + ccl_fetch(sd, v) = isect->v; #endif #ifdef __HAIR__ - if(sd->type & PRIMITIVE_ALL_CURVE) { + if(ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE) { /* curve */ - float4 curvedata = kernel_tex_fetch(__curves, sd->prim); + float4 curvedata = kernel_tex_fetch(__curves, ccl_fetch(sd, prim)); - sd->shader = __float_as_int(curvedata.z); - sd->P = bvh_curve_refine(kg, sd, isect, ray); + ccl_fetch(sd, shader) = __float_as_int(curvedata.z); + ccl_fetch(sd, P) = bvh_curve_refine(kg, sd, isect, ray); } else #endif - if(sd->type & PRIMITIVE_TRIANGLE) { + if(ccl_fetch(sd, type) & PRIMITIVE_TRIANGLE) { /* static triangle */ float3 Ng = triangle_normal(kg, sd); - sd->shader = kernel_tex_fetch(__tri_shader, sd->prim); + ccl_fetch(sd, shader) = kernel_tex_fetch(__tri_shader, ccl_fetch(sd, prim)); /* vectors */ - sd->P = triangle_refine(kg, sd, isect, ray); - sd->Ng = Ng; - sd->N = Ng; + ccl_fetch(sd, P) = triangle_refine(kg, sd, isect, ray); + ccl_fetch(sd, Ng) = Ng; + ccl_fetch(sd, N) = Ng; /* smooth normal */ - if(sd->shader & SHADER_SMOOTH_NORMAL) - sd->N = triangle_smooth_normal(kg, sd->prim, sd->u, sd->v); + 
if(ccl_fetch(sd, shader) & SHADER_SMOOTH_NORMAL) + ccl_fetch(sd, N) = triangle_smooth_normal(kg, ccl_fetch(sd, prim), ccl_fetch(sd, u), ccl_fetch(sd, v)); #ifdef __DPDU__ /* dPdu/dPdv */ - triangle_dPdudv(kg, sd->prim, &sd->dPdu, &sd->dPdv); + triangle_dPdudv(kg, ccl_fetch(sd, prim), &ccl_fetch(sd, dPdu), &ccl_fetch(sd, dPdv)); #endif } else { @@ -108,40 +108,40 @@ ccl_device void shader_setup_from_ray(KernelGlobals *kg, ShaderData *sd, motion_triangle_shader_setup(kg, sd, isect, ray, false); } - sd->I = -ray->D; + ccl_fetch(sd, I) = -ray->D; - sd->flag |= kernel_tex_fetch(__shader_flag, (sd->shader & SHADER_MASK)*2); + ccl_fetch(sd, flag) |= kernel_tex_fetch(__shader_flag, (ccl_fetch(sd, shader) & SHADER_MASK)*2); #ifdef __INSTANCING__ if(isect->object != OBJECT_NONE) { /* instance transform */ - object_normal_transform(kg, sd, &sd->N); - object_normal_transform(kg, sd, &sd->Ng); + object_normal_transform_auto(kg, sd, &ccl_fetch(sd, N)); + object_normal_transform_auto(kg, sd, &ccl_fetch(sd, Ng)); #ifdef __DPDU__ - object_dir_transform(kg, sd, &sd->dPdu); - object_dir_transform(kg, sd, &sd->dPdv); + object_dir_transform_auto(kg, sd, &ccl_fetch(sd, dPdu)); + object_dir_transform_auto(kg, sd, &ccl_fetch(sd, dPdv)); #endif } #endif /* backfacing test */ - bool backfacing = (dot(sd->Ng, sd->I) < 0.0f); + bool backfacing = (dot(ccl_fetch(sd, Ng), ccl_fetch(sd, I)) < 0.0f); if(backfacing) { - sd->flag |= SD_BACKFACING; - sd->Ng = -sd->Ng; - sd->N = -sd->N; + ccl_fetch(sd, flag) |= SD_BACKFACING; + ccl_fetch(sd, Ng) = -ccl_fetch(sd, Ng); + ccl_fetch(sd, N) = -ccl_fetch(sd, N); #ifdef __DPDU__ - sd->dPdu = -sd->dPdu; - sd->dPdv = -sd->dPdv; + ccl_fetch(sd, dPdu) = -ccl_fetch(sd, dPdu); + ccl_fetch(sd, dPdv) = -ccl_fetch(sd, dPdv); #endif } #ifdef __RAY_DIFFERENTIALS__ /* differentials */ - differential_transfer(&sd->dP, ray->dP, ray->D, ray->dD, sd->Ng, isect->t); - differential_incoming(&sd->dI, ray->dD); - differential_dudv(&sd->du, &sd->dv, sd->dPdu, sd->dPdv, sd->dP, 
sd->Ng); + differential_transfer(&ccl_fetch(sd, dP), ray->dP, ray->D, ray->dD, ccl_fetch(sd, Ng), isect->t); + differential_incoming(&ccl_fetch(sd, dI), ray->dD); + differential_dudv(&ccl_fetch(sd, du), &ccl_fetch(sd, dv), ccl_fetch(sd, dPdu), ccl_fetch(sd, dPdv), ccl_fetch(sd, dP), ccl_fetch(sd, Ng)); #endif } @@ -166,7 +166,7 @@ ccl_device_inline void shader_setup_from_subsurface(KernelGlobals *kg, ShaderDat /* fetch triangle data */ if(sd->type == PRIMITIVE_TRIANGLE) { float3 Ng = triangle_normal(kg, sd); - sd->shader = kernel_tex_fetch(__tri_shader, sd->prim); + sd->shader = kernel_tex_fetch(__tri_shader, sd->prim); /* static triangle */ sd->P = triangle_refine_subsurface(kg, sd, isect, ray); @@ -230,105 +230,105 @@ ccl_device void shader_setup_from_sample(KernelGlobals *kg, ShaderData *sd, int shader, int object, int prim, float u, float v, float t, float time, int bounce, int transparent_bounce) { /* vectors */ - sd->P = P; - sd->N = Ng; - sd->Ng = Ng; - sd->I = I; - sd->shader = shader; - sd->type = (prim == PRIM_NONE)? PRIMITIVE_NONE: PRIMITIVE_TRIANGLE; + ccl_fetch(sd, P) = P; + ccl_fetch(sd, N) = Ng; + ccl_fetch(sd, Ng) = Ng; + ccl_fetch(sd, I) = I; + ccl_fetch(sd, shader) = shader; + ccl_fetch(sd, type) = (prim == PRIM_NONE)? 
PRIMITIVE_NONE: PRIMITIVE_TRIANGLE; /* primitive */ #ifdef __INSTANCING__ - sd->object = object; + ccl_fetch(sd, object) = object; #endif /* currently no access to bvh prim index for strand sd->prim*/ - sd->prim = prim; + ccl_fetch(sd, prim) = prim; #ifdef __UV__ - sd->u = u; - sd->v = v; + ccl_fetch(sd, u) = u; + ccl_fetch(sd, v) = v; #endif - sd->ray_length = t; - sd->ray_depth = bounce; - sd->transparent_depth = transparent_bounce; + ccl_fetch(sd, ray_length) = t; + ccl_fetch(sd, ray_depth) = bounce; + ccl_fetch(sd, transparent_depth) = transparent_bounce; /* detect instancing, for non-instanced the object index is -object-1 */ #ifdef __INSTANCING__ bool instanced = false; - if(sd->prim != PRIM_NONE) { - if(sd->object >= 0) + if(ccl_fetch(sd, prim) != PRIM_NONE) { + if(ccl_fetch(sd, object) >= 0) instanced = true; else #endif - sd->object = ~sd->object; + ccl_fetch(sd, object) = ~ccl_fetch(sd, object); #ifdef __INSTANCING__ } #endif - sd->flag = kernel_tex_fetch(__shader_flag, (sd->shader & SHADER_MASK)*2); - if(sd->object != OBJECT_NONE) { - sd->flag |= kernel_tex_fetch(__object_flag, sd->object); + ccl_fetch(sd, flag) = kernel_tex_fetch(__shader_flag, (ccl_fetch(sd, shader) & SHADER_MASK)*2); + if(ccl_fetch(sd, object) != OBJECT_NONE) { + ccl_fetch(sd, flag) |= kernel_tex_fetch(__object_flag, ccl_fetch(sd, object)); #ifdef __OBJECT_MOTION__ shader_setup_object_transforms(kg, sd, time); } - sd->time = time; + ccl_fetch(sd, time) = time; #else } #endif - if(sd->type & PRIMITIVE_TRIANGLE) { + if(ccl_fetch(sd, type) & PRIMITIVE_TRIANGLE) { /* smooth normal */ - if(sd->shader & SHADER_SMOOTH_NORMAL) { - sd->N = triangle_smooth_normal(kg, sd->prim, sd->u, sd->v); + if(ccl_fetch(sd, shader) & SHADER_SMOOTH_NORMAL) { + ccl_fetch(sd, N) = triangle_smooth_normal(kg, ccl_fetch(sd, prim), ccl_fetch(sd, u), ccl_fetch(sd, v)); #ifdef __INSTANCING__ if(instanced) - object_normal_transform(kg, sd, &sd->N); + object_normal_transform_auto(kg, sd, &ccl_fetch(sd, N)); #endif } /* 
dPdu/dPdv */ #ifdef __DPDU__ - triangle_dPdudv(kg, sd->prim, &sd->dPdu, &sd->dPdv); + triangle_dPdudv(kg, ccl_fetch(sd, prim), &ccl_fetch(sd, dPdu), &ccl_fetch(sd, dPdv)); #ifdef __INSTANCING__ if(instanced) { - object_dir_transform(kg, sd, &sd->dPdu); - object_dir_transform(kg, sd, &sd->dPdv); + object_dir_transform_auto(kg, sd, &ccl_fetch(sd, dPdu)); + object_dir_transform_auto(kg, sd, &ccl_fetch(sd, dPdv)); } #endif #endif } else { #ifdef __DPDU__ - sd->dPdu = make_float3(0.0f, 0.0f, 0.0f); - sd->dPdv = make_float3(0.0f, 0.0f, 0.0f); + ccl_fetch(sd, dPdu) = make_float3(0.0f, 0.0f, 0.0f); + ccl_fetch(sd, dPdv) = make_float3(0.0f, 0.0f, 0.0f); #endif } /* backfacing test */ - if(sd->prim != PRIM_NONE) { - bool backfacing = (dot(sd->Ng, sd->I) < 0.0f); + if(ccl_fetch(sd, prim) != PRIM_NONE) { + bool backfacing = (dot(ccl_fetch(sd, Ng), ccl_fetch(sd, I)) < 0.0f); if(backfacing) { - sd->flag |= SD_BACKFACING; - sd->Ng = -sd->Ng; - sd->N = -sd->N; + ccl_fetch(sd, flag) |= SD_BACKFACING; + ccl_fetch(sd, Ng) = -ccl_fetch(sd, Ng); + ccl_fetch(sd, N) = -ccl_fetch(sd, N); #ifdef __DPDU__ - sd->dPdu = -sd->dPdu; - sd->dPdv = -sd->dPdv; + ccl_fetch(sd, dPdu) = -ccl_fetch(sd, dPdu); + ccl_fetch(sd, dPdv) = -ccl_fetch(sd, dPdv); #endif } } #ifdef __RAY_DIFFERENTIALS__ /* no ray differentials here yet */ - sd->dP = differential3_zero(); - sd->dI = differential3_zero(); - sd->du = differential_zero(); - sd->dv = differential_zero(); + ccl_fetch(sd, dP) = differential3_zero(); + ccl_fetch(sd, dI) = differential3_zero(); + ccl_fetch(sd, du) = differential_zero(); + ccl_fetch(sd, dv) = differential_zero(); #endif } @@ -355,45 +355,46 @@ ccl_device void shader_setup_from_displace(KernelGlobals *kg, ShaderData *sd, ccl_device_inline void shader_setup_from_background(KernelGlobals *kg, ShaderData *sd, const Ray *ray, int bounce, int transparent_bounce) { /* vectors */ - sd->P = ray->D; - sd->N = -ray->D; - sd->Ng = -ray->D; - sd->I = -ray->D; - sd->shader = 
kernel_data.background.surface_shader; - sd->flag = kernel_tex_fetch(__shader_flag, (sd->shader & SHADER_MASK)*2); + ccl_fetch(sd, P) = ray->D; + ccl_fetch(sd, N) = -ray->D; + ccl_fetch(sd, Ng) = -ray->D; + ccl_fetch(sd, I) = -ray->D; + ccl_fetch(sd, shader) = kernel_data.background.surface_shader; + ccl_fetch(sd, flag) = kernel_tex_fetch(__shader_flag, (ccl_fetch(sd, shader) & SHADER_MASK)*2); #ifdef __OBJECT_MOTION__ - sd->time = ray->time; + ccl_fetch(sd, time) = ray->time; #endif - sd->ray_length = 0.0f; - sd->ray_depth = bounce; - sd->transparent_depth = transparent_bounce; + ccl_fetch(sd, ray_length) = 0.0f; + ccl_fetch(sd, ray_depth) = bounce; + ccl_fetch(sd, transparent_depth) = transparent_bounce; #ifdef __INSTANCING__ - sd->object = PRIM_NONE; + ccl_fetch(sd, object) = PRIM_NONE; #endif - sd->prim = PRIM_NONE; + ccl_fetch(sd, prim) = PRIM_NONE; #ifdef __UV__ - sd->u = 0.0f; - sd->v = 0.0f; + ccl_fetch(sd, u) = 0.0f; + ccl_fetch(sd, v) = 0.0f; #endif #ifdef __DPDU__ /* dPdu/dPdv */ - sd->dPdu = make_float3(0.0f, 0.0f, 0.0f); - sd->dPdv = make_float3(0.0f, 0.0f, 0.0f); + ccl_fetch(sd, dPdu) = make_float3(0.0f, 0.0f, 0.0f); + ccl_fetch(sd, dPdv) = make_float3(0.0f, 0.0f, 0.0f); #endif #ifdef __RAY_DIFFERENTIALS__ /* differentials */ - sd->dP = ray->dD; - differential_incoming(&sd->dI, sd->dP); - sd->du = differential_zero(); - sd->dv = differential_zero(); + ccl_fetch(sd, dP) = ray->dD; + differential_incoming(&ccl_fetch(sd, dI), ccl_fetch(sd, dP)); + ccl_fetch(sd, du) = differential_zero(); + ccl_fetch(sd, dv) = differential_zero(); #endif } /* ShaderData setup from point inside volume */ +#ifdef __VOLUME__ ccl_device_inline void shader_setup_from_volume(KernelGlobals *kg, ShaderData *sd, const Ray *ray, int bounce, int transparent_bounce) { /* vectors */ @@ -439,6 +440,7 @@ ccl_device_inline void shader_setup_from_volume(KernelGlobals *kg, ShaderData *s sd->ray_P = ray->P; sd->ray_dP = ray->dP; } +#endif /* Merging */ @@ -478,6 +480,7 @@ ccl_device void 
shader_merge_closures(ShaderData *sd) } sd->num_closure--; + kernel_assert(sd->num_closure >= 0); j--; } } @@ -491,11 +494,11 @@ ccl_device_inline void _shader_bsdf_multi_eval(KernelGlobals *kg, const ShaderDa { /* this is the veach one-sample model with balance heuristic, some pdf * factors drop out when using balance heuristic weighting */ - for(int i = 0; i< sd->num_closure; i++) { + for(int i = 0; i < ccl_fetch(sd, num_closure); i++) { if(i == skip_bsdf) continue; - const ShaderClosure *sc = &sd->closure[i]; + const ShaderClosure *sc = ccl_fetch_array(sd, closure, i); if(CLOSURE_IS_BSDF(sc->type)) { float bsdf_pdf = 0.0f; @@ -513,7 +516,7 @@ ccl_device_inline void _shader_bsdf_multi_eval(KernelGlobals *kg, const ShaderDa *pdf = (sum_sample_weight > 0.0f)? sum_pdf/sum_sample_weight: 0.0f; } -ccl_device void shader_bsdf_eval(KernelGlobals *kg, const ShaderData *sd, +ccl_device void shader_bsdf_eval(KernelGlobals *kg, ShaderData *sd, const float3 omega_in, BsdfEval *eval, float *pdf) { bsdf_eval_init(eval, NBUILTIN_CLOSURES, make_float3(0.0f, 0.0f, 0.0f), kernel_data.film.use_light_pass); @@ -527,22 +530,22 @@ ccl_device int shader_bsdf_sample(KernelGlobals *kg, const ShaderData *sd, { int sampled = 0; - if(sd->num_closure > 1) { + if(ccl_fetch(sd, num_closure) > 1) { /* pick a BSDF closure based on sample weights */ float sum = 0.0f; - for(sampled = 0; sampled < sd->num_closure; sampled++) { - const ShaderClosure *sc = &sd->closure[sampled]; + for(sampled = 0; sampled < ccl_fetch(sd, num_closure); sampled++) { + const ShaderClosure *sc = ccl_fetch_array(sd, closure, sampled); if(CLOSURE_IS_BSDF(sc->type)) sum += sc->sample_weight; } - float r = sd->randb_closure*sum; + float r = ccl_fetch(sd, randb_closure)*sum; sum = 0.0f; - for(sampled = 0; sampled < sd->num_closure; sampled++) { - const ShaderClosure *sc = &sd->closure[sampled]; + for(sampled = 0; sampled < ccl_fetch(sd, num_closure); sampled++) { + const ShaderClosure *sc = ccl_fetch_array(sd, closure, 
sampled); if(CLOSURE_IS_BSDF(sc->type)) { sum += sc->sample_weight; @@ -552,13 +555,14 @@ ccl_device int shader_bsdf_sample(KernelGlobals *kg, const ShaderData *sd, } } - if(sampled == sd->num_closure) { + if(sampled == ccl_fetch(sd, num_closure)) { *pdf = 0.0f; return LABEL_NONE; } } - const ShaderClosure *sc = &sd->closure[sampled]; + const ShaderClosure *sc = ccl_fetch_array(sd, closure, sampled); + int label; float3 eval; @@ -568,7 +572,7 @@ ccl_device int shader_bsdf_sample(KernelGlobals *kg, const ShaderData *sd, if(*pdf != 0.0f) { bsdf_eval_init(bsdf_eval, sc->type, eval*sc->weight, kernel_data.film.use_light_pass); - if(sd->num_closure > 1) { + if(ccl_fetch(sd, num_closure) > 1) { float sweight = sc->sample_weight; _shader_bsdf_multi_eval(kg, sd, *omega_in, pdf, sampled, bsdf_eval, *pdf*sweight, sweight); } @@ -595,8 +599,8 @@ ccl_device int shader_bsdf_sample_closure(KernelGlobals *kg, const ShaderData *s ccl_device void shader_bsdf_blur(KernelGlobals *kg, ShaderData *sd, float roughness) { - for(int i = 0; i< sd->num_closure; i++) { - ShaderClosure *sc = &sd->closure[i]; + for(int i = 0; i < ccl_fetch(sd, num_closure); i++) { + ShaderClosure *sc = ccl_fetch_array(sd, closure, i); if(CLOSURE_IS_BSDF(sc->type)) bsdf_blur(kg, sc, roughness); @@ -605,13 +609,13 @@ ccl_device void shader_bsdf_blur(KernelGlobals *kg, ShaderData *sd, float roughn ccl_device float3 shader_bsdf_transparency(KernelGlobals *kg, ShaderData *sd) { - if(sd->flag & SD_HAS_ONLY_VOLUME) + if(ccl_fetch(sd, flag) & SD_HAS_ONLY_VOLUME) return make_float3(1.0f, 1.0f, 1.0f); float3 eval = make_float3(0.0f, 0.0f, 0.0f); - for(int i = 0; i< sd->num_closure; i++) { - ShaderClosure *sc = &sd->closure[i]; + for(int i = 0; i < ccl_fetch(sd, num_closure); i++) { + ShaderClosure *sc = ccl_fetch_array(sd, closure, i); if(sc->type == CLOSURE_BSDF_TRANSPARENT_ID) // todo: make this work for osl eval += sc->weight; @@ -634,8 +638,8 @@ ccl_device float3 shader_bsdf_diffuse(KernelGlobals *kg, ShaderData 
*sd) { float3 eval = make_float3(0.0f, 0.0f, 0.0f); - for(int i = 0; i< sd->num_closure; i++) { - ShaderClosure *sc = &sd->closure[i]; + for(int i = 0; i < ccl_fetch(sd, num_closure); i++) { + ShaderClosure *sc = ccl_fetch_array(sd, closure, i); if(CLOSURE_IS_BSDF_DIFFUSE(sc->type)) eval += sc->weight; @@ -648,8 +652,8 @@ ccl_device float3 shader_bsdf_glossy(KernelGlobals *kg, ShaderData *sd) { float3 eval = make_float3(0.0f, 0.0f, 0.0f); - for(int i = 0; i< sd->num_closure; i++) { - ShaderClosure *sc = &sd->closure[i]; + for(int i = 0; i < ccl_fetch(sd, num_closure); i++) { + ShaderClosure *sc = ccl_fetch_array(sd, closure, i); if(CLOSURE_IS_BSDF_GLOSSY(sc->type)) eval += sc->weight; @@ -662,8 +666,8 @@ ccl_device float3 shader_bsdf_transmission(KernelGlobals *kg, ShaderData *sd) { float3 eval = make_float3(0.0f, 0.0f, 0.0f); - for(int i = 0; i< sd->num_closure; i++) { - ShaderClosure *sc = &sd->closure[i]; + for(int i = 0; i < ccl_fetch(sd, num_closure); i++) { + ShaderClosure *sc = ccl_fetch_array(sd, closure, i); if(CLOSURE_IS_BSDF_TRANSMISSION(sc->type)) eval += sc->weight; @@ -676,8 +680,8 @@ ccl_device float3 shader_bsdf_subsurface(KernelGlobals *kg, ShaderData *sd) { float3 eval = make_float3(0.0f, 0.0f, 0.0f); - for(int i = 0; i< sd->num_closure; i++) { - ShaderClosure *sc = &sd->closure[i]; + for(int i = 0; i < ccl_fetch(sd, num_closure); i++) { + ShaderClosure *sc = ccl_fetch_array(sd, closure, i); if(CLOSURE_IS_BSSRDF(sc->type) || CLOSURE_IS_BSDF_BSSRDF(sc->type)) eval += sc->weight; @@ -691,8 +695,8 @@ ccl_device float3 shader_bsdf_ao(KernelGlobals *kg, ShaderData *sd, float ao_fac float3 eval = make_float3(0.0f, 0.0f, 0.0f); float3 N = make_float3(0.0f, 0.0f, 0.0f); - for(int i = 0; i< sd->num_closure; i++) { - ShaderClosure *sc = &sd->closure[i]; + for(int i = 0; i < ccl_fetch(sd, num_closure); i++) { + ShaderClosure *sc = ccl_fetch_array(sd, closure, i); if(CLOSURE_IS_BSDF_DIFFUSE(sc->type)) { eval += sc->weight*ao_factor; @@ -700,12 +704,12 @@ 
ccl_device float3 shader_bsdf_ao(KernelGlobals *kg, ShaderData *sd, float ao_fac } else if(CLOSURE_IS_AMBIENT_OCCLUSION(sc->type)) { eval += sc->weight; - N += sd->N*average(sc->weight); + N += ccl_fetch(sd, N)*average(sc->weight); } } if(is_zero(N)) - N = sd->N; + N = ccl_fetch(sd, N); else N = normalize(N); @@ -719,8 +723,8 @@ ccl_device float3 shader_bssrdf_sum(ShaderData *sd, float3 *N_, float *texture_b float3 N = make_float3(0.0f, 0.0f, 0.0f); float texture_blur = 0.0f, weight_sum = 0.0f; - for(int i = 0; i< sd->num_closure; i++) { - ShaderClosure *sc = &sd->closure[i]; + for(int i = 0; i < ccl_fetch(sd, num_closure); i++) { + ShaderClosure *sc = ccl_fetch_array(sd, closure, i); if(CLOSURE_IS_BSSRDF(sc->type)) { float avg_weight = fabsf(average(sc->weight)); @@ -733,7 +737,7 @@ ccl_device float3 shader_bssrdf_sum(ShaderData *sd, float3 *N_, float *texture_b } if(N_) - *N_ = (is_zero(N))? sd->N: normalize(N); + *N_ = (is_zero(N))? ccl_fetch(sd, N): normalize(N); if(texture_blur_) *texture_blur_ = texture_blur/weight_sum; @@ -745,7 +749,7 @@ ccl_device float3 shader_bssrdf_sum(ShaderData *sd, float3 *N_, float *texture_b ccl_device float3 emissive_eval(KernelGlobals *kg, ShaderData *sd, ShaderClosure *sc) { - return emissive_simple_eval(sd->Ng, sd->I); + return emissive_simple_eval(ccl_fetch(sd, Ng), ccl_fetch(sd, I)); } ccl_device float3 shader_emissive_eval(KernelGlobals *kg, ShaderData *sd) @@ -753,8 +757,8 @@ ccl_device float3 shader_emissive_eval(KernelGlobals *kg, ShaderData *sd) float3 eval; eval = make_float3(0.0f, 0.0f, 0.0f); - for(int i = 0; i < sd->num_closure; i++) { - ShaderClosure *sc = &sd->closure[i]; + for(int i = 0; i < ccl_fetch(sd, num_closure); i++) { + ShaderClosure *sc = ccl_fetch_array(sd, closure, i); if(CLOSURE_IS_EMISSION(sc->type)) eval += emissive_eval(kg, sd, sc)*sc->weight; @@ -769,8 +773,8 @@ ccl_device float3 shader_holdout_eval(KernelGlobals *kg, ShaderData *sd) { float3 weight = make_float3(0.0f, 0.0f, 0.0f); - for(int i = 0; 
i < sd->num_closure; i++) { - ShaderClosure *sc = &sd->closure[i]; + for(int i = 0; i < ccl_fetch(sd, num_closure); i++) { + ShaderClosure *sc = ccl_fetch_array(sd, closure, i); if(CLOSURE_IS_HOLDOUT(sc->type)) weight += sc->weight; @@ -784,8 +788,8 @@ ccl_device float3 shader_holdout_eval(KernelGlobals *kg, ShaderData *sd) ccl_device void shader_eval_surface(KernelGlobals *kg, ShaderData *sd, float randb, int path_flag, ShaderContext ctx) { - sd->num_closure = 0; - sd->randb_closure = randb; + ccl_fetch(sd, num_closure) = 0; + ccl_fetch(sd, randb_closure) = randb; #ifdef __OSL__ if(kg->osl) @@ -796,9 +800,11 @@ ccl_device void shader_eval_surface(KernelGlobals *kg, ShaderData *sd, #ifdef __SVM__ svm_eval_nodes(kg, sd, SHADER_TYPE_SURFACE, path_flag); #else - sd->closure->weight = make_float3(0.8f, 0.8f, 0.8f); - sd->closure->N = sd->N; - sd->flag |= bsdf_diffuse_setup(&sd->closure); + ccl_fetch_array(sd, closure, 0)->weight = make_float3(0.8f, 0.8f, 0.8f); + ccl_fetch_array(sd, closure, 0)->N = ccl_fetch(sd, N); + ccl_fetch_array(sd, closure, 0)->data0 = 0.0f; + ccl_fetch_array(sd, closure, 0)->data1 = 0.0f; + ccl_fetch(sd, flag) |= bsdf_diffuse_setup(ccl_fetch_array(sd, closure, 0)); #endif } } @@ -807,8 +813,8 @@ ccl_device void shader_eval_surface(KernelGlobals *kg, ShaderData *sd, ccl_device float3 shader_eval_background(KernelGlobals *kg, ShaderData *sd, int path_flag, ShaderContext ctx) { - sd->num_closure = 0; - sd->randb_closure = 0.0f; + ccl_fetch(sd, num_closure) = 0; + ccl_fetch(sd, randb_closure) = 0.0f; #ifdef __OSL__ if(kg->osl) { @@ -823,8 +829,8 @@ ccl_device float3 shader_eval_background(KernelGlobals *kg, ShaderData *sd, int float3 eval = make_float3(0.0f, 0.0f, 0.0f); - for(int i = 0; i< sd->num_closure; i++) { - const ShaderClosure *sc = &sd->closure[i]; + for(int i = 0; i < ccl_fetch(sd, num_closure); i++) { + const ShaderClosure *sc = ccl_fetch_array(sd, closure, i); if(CLOSURE_IS_BACKGROUND(sc->type)) eval += sc->weight; @@ -844,7 +850,7 @@ 
ccl_device float3 shader_eval_background(KernelGlobals *kg, ShaderData *sd, int ccl_device_inline void _shader_volume_phase_multi_eval(const ShaderData *sd, const float3 omega_in, float *pdf, int skip_phase, BsdfEval *result_eval, float sum_pdf, float sum_sample_weight) { - for(int i = 0; i< sd->num_closure; i++) { + for(int i = 0; i < sd->num_closure; i++) { if(i == skip_phase) continue; @@ -997,8 +1003,8 @@ ccl_device void shader_eval_volume(KernelGlobals *kg, ShaderData *sd, ccl_device void shader_eval_displacement(KernelGlobals *kg, ShaderData *sd, ShaderContext ctx) { - sd->num_closure = 0; - sd->randb_closure = 0.0f; + ccl_fetch(sd, num_closure) = 0; + ccl_fetch(sd, randb_closure) = 0.0f; /* this will modify sd->P */ #ifdef __SVM__ diff --git a/intern/cycles/kernel/kernel_shaderdata_vars.h b/intern/cycles/kernel/kernel_shaderdata_vars.h new file mode 100644 index 00000000000..b157b82e023 --- /dev/null +++ b/intern/cycles/kernel/kernel_shaderdata_vars.h @@ -0,0 +1,99 @@ +/* +* Copyright 2011-2015 Blender Foundation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*/ + +#ifndef SD_VAR +#define SD_VAR(type, what) +#endif +#ifndef SD_CLOSURE_VAR +#define SD_CLOSURE_VAR(type, what, max_closure) +#endif + +/* position */ +SD_VAR(float3, P) +/* smooth normal for shading */ +SD_VAR(float3, N) +/* true geometric normal */ +SD_VAR(float3, Ng) +/* view/incoming direction */ +SD_VAR(float3, I) +/* shader id */ +SD_VAR(int, shader) +/* booleans describing shader, see ShaderDataFlag */ +SD_VAR(int, flag) + +/* primitive id if there is one, ~0 otherwise */ +SD_VAR(int, prim) + +/* combined type and curve segment for hair */ +SD_VAR(int, type) + +/* parametric coordinates +* - barycentric weights for triangles */ +SD_VAR(float, u) +SD_VAR(float, v) +/* object id if there is one, ~0 otherwise */ +SD_VAR(int, object) + +/* motion blur sample time */ +SD_VAR(float, time) + +/* length of the ray being shaded */ +SD_VAR(float, ray_length) + +/* ray bounce depth */ +SD_VAR(int, ray_depth) + +/* ray transparent depth */ +SD_VAR(int, transparent_depth) + +#ifdef __RAY_DIFFERENTIALS__ +/* differential of P. these are orthogonal to Ng, not N */ +SD_VAR(differential3, dP) +/* differential of I */ +SD_VAR(differential3, dI) +/* differential of u, v */ +SD_VAR(differential, du) +SD_VAR(differential, dv) +#endif +#ifdef __DPDU__ +/* differential of P w.r.t. parametric coordinates. note that dPdu is +* not readily suitable as a tangent for shading on triangles. 
*/ +SD_VAR(float3, dPdu) +SD_VAR(float3, dPdv) +#endif + +#ifdef __OBJECT_MOTION__ +/* object <-> world space transformations, cached to avoid +* re-interpolating them constantly for shading */ +SD_VAR(Transform, ob_tfm) +SD_VAR(Transform, ob_itfm) +#endif + +/* Closure data, we store a fixed array of closures */ +SD_CLOSURE_VAR(ShaderClosure, closure, MAX_CLOSURE) +SD_VAR(int, num_closure) +SD_VAR(float, randb_closure) + +/* ray start position, only set for backgrounds */ +SD_VAR(float3, ray_P) +SD_VAR(differential3, ray_dP) + +#ifdef __OSL__ +SD_VAR(struct KernelGlobals *, osl_globals) +#endif + +#undef SD_VAR +#undef SD_CLOSURE_VAR diff --git a/intern/cycles/kernel/kernel_shadow.h b/intern/cycles/kernel/kernel_shadow.h index 8923fcebee5..2811a8348ca 100644 --- a/intern/cycles/kernel/kernel_shadow.h +++ b/intern/cycles/kernel/kernel_shadow.h @@ -39,19 +39,6 @@ CCL_NAMESPACE_BEGIN * This is CPU only because of qsort, and malloc or high stack space usage to * record all these intersections. 
*/ -ccl_device_noinline int shadow_intersections_compare(const void *a, const void *b) -{ - const Intersection *isect_a = (const Intersection*)a; - const Intersection *isect_b = (const Intersection*)b; - - if(isect_a->t < isect_b->t) - return -1; - else if(isect_a->t > isect_b->t) - return 1; - else - return 0; -} - #define STACK_MAX_HITS 64 ccl_device_inline bool shadow_blocked(KernelGlobals *kg, PathState *state, Ray *ray, float3 *shadow) @@ -95,7 +82,7 @@ ccl_device_inline bool shadow_blocked(KernelGlobals *kg, PathState *state, Ray * PathState ps = *state; #endif - qsort(hits, num_hits, sizeof(Intersection), shadow_intersections_compare); + qsort(hits, num_hits, sizeof(Intersection), intersections_compare); for(int hit = 0; hit < num_hits; hit++, isect++) { /* adjust intersection distance for moving ray forward */ @@ -193,19 +180,36 @@ ccl_device_inline bool shadow_blocked(KernelGlobals *kg, PathState *state, Ray * * potentially transparent, and only in that case start marching. this gives * one extra ray cast for the cases were we do want transparency. 
*/ -ccl_device_inline bool shadow_blocked(KernelGlobals *kg, PathState *state, Ray *ray, float3 *shadow) +ccl_device_inline bool shadow_blocked(KernelGlobals *kg, ccl_addr_space PathState *state, ccl_addr_space Ray *ray_input, float3 *shadow +#ifdef __SPLIT_KERNEL__ + , ShaderData *sd_mem, Intersection *isect_mem +#endif + ) { *shadow = make_float3(1.0f, 1.0f, 1.0f); - if(ray->t == 0.0f) + if(ray_input->t == 0.0f) return false; - Intersection isect; - bool blocked = scene_intersect(kg, ray, PATH_RAY_SHADOW_OPAQUE, &isect, NULL, 0.0f, 0.0f); +#ifdef __SPLIT_KERNEL__ + Ray private_ray = *ray_input; + Ray *ray = &private_ray; +#else + Ray *ray = ray_input; +#endif + +#ifdef __SPLIT_KERNEL__ + Intersection *isect = isect_mem; +#else + Intersection isect_object; + Intersection *isect = &isect_object; +#endif + + bool blocked = scene_intersect(kg, ray, PATH_RAY_SHADOW_OPAQUE, isect, NULL, 0.0f, 0.0f); #ifdef __TRANSPARENT_SHADOWS__ if(blocked && kernel_data.integrator.transparent_shadows) { - if(shader_transparent_shadow(kg, &isect)) { + if(shader_transparent_shadow(kg, isect)) { float3 throughput = make_float3(1.0f, 1.0f, 1.0f); float3 Pend = ray->P + ray->D*ray->t; int bounce = state->transparent_bounce; @@ -217,9 +221,8 @@ ccl_device_inline bool shadow_blocked(KernelGlobals *kg, PathState *state, Ray * if(bounce >= kernel_data.integrator.transparent_max_bounce) return true; - if(!scene_intersect(kg, ray, PATH_RAY_SHADOW_TRANSPARENT, &isect, NULL, 0.0f, 0.0f)) + if(!scene_intersect(kg, ray, PATH_RAY_SHADOW_TRANSPARENT, isect, NULL, 0.0f, 0.0f)) { - #ifdef __VOLUME__ /* attenuation for last line segment towards light */ if(ps.volume_stack[0].shader != SHADER_NONE) @@ -231,39 +234,44 @@ ccl_device_inline bool shadow_blocked(KernelGlobals *kg, PathState *state, Ray * return false; } - if(!shader_transparent_shadow(kg, &isect)) + if(!shader_transparent_shadow(kg, isect)) return true; #ifdef __VOLUME__ /* attenuation between last surface and next surface */ 
if(ps.volume_stack[0].shader != SHADER_NONE) { Ray segment_ray = *ray; - segment_ray.t = isect.t; + segment_ray.t = isect->t; kernel_volume_shadow(kg, &ps, &segment_ray, &throughput); } #endif /* setup shader data at surface */ - ShaderData sd; - shader_setup_from_ray(kg, &sd, &isect, ray, state->bounce+1, bounce); +#ifdef __SPLIT_KERNEL__ + ShaderData *sd = sd_mem; +#else + ShaderData sd_object; + ShaderData *sd = &sd_object; +#endif + shader_setup_from_ray(kg, sd, isect, ray, state->bounce+1, bounce); /* attenuation from transparent surface */ - if(!(sd.flag & SD_HAS_ONLY_VOLUME)) { - shader_eval_surface(kg, &sd, 0.0f, PATH_RAY_SHADOW, SHADER_CONTEXT_SHADOW); - throughput *= shader_bsdf_transparency(kg, &sd); + if(!(ccl_fetch(sd, flag) & SD_HAS_ONLY_VOLUME)) { + shader_eval_surface(kg, sd, 0.0f, PATH_RAY_SHADOW, SHADER_CONTEXT_SHADOW); + throughput *= shader_bsdf_transparency(kg, sd); } if(is_zero(throughput)) return true; /* move ray forward */ - ray->P = ray_offset(sd.P, -sd.Ng); + ray->P = ray_offset(ccl_fetch(sd, P), -ccl_fetch(sd, Ng)); if(ray->t != FLT_MAX) ray->D = normalize_len(Pend - ray->P, &ray->t); #ifdef __VOLUME__ /* exit/enter volume */ - kernel_volume_stack_enter_exit(kg, &sd, ps.volume_stack); + kernel_volume_stack_enter_exit(kg, sd, ps.volume_stack); #endif bounce++; diff --git a/intern/cycles/kernel/kernel_textures.h b/intern/cycles/kernel/kernel_textures.h index 374dc6d1dd9..f545a056cc8 100644 --- a/intern/cycles/kernel/kernel_textures.h +++ b/intern/cycles/kernel/kernel_textures.h @@ -24,6 +24,7 @@ /* bvh */ KERNEL_TEX(float4, texture_float4, __bvh_nodes) +KERNEL_TEX(float4, texture_float4, __bvh_leaf_nodes) KERNEL_TEX(float4, texture_float4, __tri_woop) KERNEL_TEX(uint, texture_uint, __prim_type) KERNEL_TEX(uint, texture_uint, __prim_visibility) diff --git a/intern/cycles/kernel/kernel_types.h b/intern/cycles/kernel/kernel_types.h index 238b4b0bfdc..2a70bfcb8f0 100644 --- a/intern/cycles/kernel/kernel_types.h +++ 
b/intern/cycles/kernel/kernel_types.h @@ -24,6 +24,13 @@ #define __KERNEL_CPU__ #endif +/* TODO(sergey): This is only to make it possible to include this header + * from outside of the kernel. but this could be done somewhat cleaner? + */ +#ifndef ccl_addr_space +#define ccl_addr_space +#endif + CCL_NAMESPACE_BEGIN /* constants */ @@ -38,12 +45,6 @@ CCL_NAMESPACE_BEGIN #define BSSRDF_MIN_RADIUS 1e-8f #define BSSRDF_MAX_HITS 4 -#define BB_DRAPER 800.0f -#define BB_MAX_TABLE_RANGE 12000.0f -#define BB_TABLE_XPOWER 1.5f -#define BB_TABLE_YPOWER 5.0f -#define BB_TABLE_SPACING 2.0f - #define BECKMANN_TABLE_SIZE 256 #define TEX_NUM_FLOAT_IMAGES 5 @@ -72,6 +73,7 @@ CCL_NAMESPACE_BEGIN #define __VOLUME_DECOUPLED__ #define __VOLUME_SCATTER__ #define __SHADOW_RECORD_ALL__ +#define __VOLUME_RECORD_ALL__ #endif #ifdef __KERNEL_CUDA__ @@ -82,7 +84,7 @@ CCL_NAMESPACE_BEGIN #define __VOLUME_SCATTER__ /* Experimental on GPU */ -#ifdef __KERNEL_CUDA_EXPERIMENTAL__ +#ifdef __KERNEL_EXPERIMENTAL__ #define __SUBSURFACE__ #define __CMJ__ #endif @@ -94,38 +96,44 @@ CCL_NAMESPACE_BEGIN /* keep __KERNEL_ADV_SHADING__ in sync with opencl_kernel_use_advanced_shading! 
*/ #ifdef __KERNEL_OPENCL_NVIDIA__ -#define __KERNEL_SHADING__ -#define __KERNEL_ADV_SHADING__ +# define __KERNEL_SHADING__ +# define __KERNEL_ADV_SHADING__ +# ifdef __KERNEL_EXPERIMENTAL__ +# define __CMJ__ +# endif #endif #ifdef __KERNEL_OPENCL_APPLE__ -#define __KERNEL_SHADING__ +# define __KERNEL_SHADING__ //#define __KERNEL_ADV_SHADING__ #endif #ifdef __KERNEL_OPENCL_AMD__ -#define __CL_USE_NATIVE__ -#define __KERNEL_SHADING__ -//__KERNEL_ADV_SHADING__ -#define __MULTI_CLOSURE__ -#define __TRANSPARENT_SHADOWS__ -#define __PASSES__ -#define __BACKGROUND_MIS__ -#define __LAMP_MIS__ -#define __AO__ -//#define __CAMERA_MOTION__ -//#define __OBJECT_MOTION__ -//#define __HAIR__ -//end __KERNEL_ADV_SHADING__ +# define __CL_USE_NATIVE__ +# define __KERNEL_SHADING__ +# define __MULTI_CLOSURE__ +# define __PASSES__ +# define __BACKGROUND_MIS__ +# define __LAMP_MIS__ +# define __AO__ +# define __CAMERA_MOTION__ +# define __OBJECT_MOTION__ +# define __HAIR__ +# ifdef __KERNEL_EXPERIMENTAL__ +# define __TRANSPARENT_SHADOWS__ +# endif #endif #ifdef __KERNEL_OPENCL_INTEL_CPU__ -#define __CL_USE_NATIVE__ -#define __KERNEL_SHADING__ -#define __KERNEL_ADV_SHADING__ +# define __CL_USE_NATIVE__ +# define __KERNEL_SHADING__ +# define __KERNEL_ADV_SHADING__ +# ifdef __KERNEL_EXPERIMENTAL__ +# define __CMJ__ +# endif #endif -#endif +#endif // __KERNEL_OPENCL__ /* kernel features */ #define __SOBOL__ @@ -164,6 +172,17 @@ CCL_NAMESPACE_BEGIN # define __KERNEL_DEBUG__ #endif +/* Scene-based selective features compilation. */ +#ifdef __NO_CAMERA_MOTION__ +# undef __CAMERA_MOTION__ +#endif +#ifdef __NO_OBJECT_MOTION__ +# undef __OBJECT_MOTION__ +#endif +#ifdef __NO_HAIR__ +# undef __HAIR__ +#endif + /* Random Numbers */ typedef uint RNG; @@ -269,9 +288,7 @@ enum PathRayFlag { PATH_RAY_MIS_SKIP = 2048, PATH_RAY_DIFFUSE_ANCESTOR = 4096, - PATH_RAY_GLOSSY_ANCESTOR = 8192, - PATH_RAY_BSSRDF_ANCESTOR = 16384, - PATH_RAY_SINGLE_PASS_DONE = 32768, + PATH_RAY_SINGLE_PASS_DONE = 8192, /* we need 
layer member flags to be the 20 upper bits */ PATH_RAY_LAYER_SHIFT = (32-20) @@ -322,6 +339,8 @@ typedef enum PassType { PASS_LIGHT = (1 << 25), /* no real pass, used to force use_light_pass */ #ifdef __KERNEL_DEBUG__ PASS_BVH_TRAVERSAL_STEPS = (1 << 26), + PASS_BVH_TRAVERSED_INSTANCES = (1 << 27), + PASS_RAY_BOUNCES = (1 << 28), #endif } PassType; @@ -329,7 +348,7 @@ typedef enum PassType { #ifdef __PASSES__ -typedef struct PathRadiance { +typedef ccl_addr_space struct PathRadiance { int use_light_pass; float3 emission; @@ -381,7 +400,7 @@ typedef struct BsdfEval { #else -typedef float3 PathRadiance; +typedef ccl_addr_space float3 PathRadiance; typedef float3 BsdfEval; #endif @@ -426,6 +445,7 @@ enum CameraType { enum PanoramaType { PANORAMA_EQUIRECTANGULAR, + PANORAMA_MIRRORBALL, PANORAMA_FISHEYE_EQUIDISTANT, PANORAMA_FISHEYE_EQUISOLID }; @@ -445,10 +465,26 @@ typedef struct differential { /* Ray */ typedef struct Ray { +/* TODO(sergey): This is only needed because current AMD + * compiler has hard time building the kernel with this + * reshuffle. And at the same time reshuffle will cause + * less optimal CPU code in certain places. + * + * We'll get rid of this nasty exception once AMD compiler + * is fixed. 
+ */ +#ifndef __KERNEL_OPENCL_AMD__ float3 P; /* origin */ float3 D; /* direction */ + float t; /* length of the ray */ float time; /* time (for motion blur) */ +#else + float t; /* length of the ray */ + float time; /* time (for motion blur) */ + float3 P; /* origin */ + float3 D; /* direction */ +#endif #ifdef __RAY_DIFFERENTIALS__ differential3 dP; @@ -458,7 +494,7 @@ typedef struct Ray { /* Intersection */ -typedef struct Intersection { +typedef ccl_addr_space struct Intersection { float t, u, v; int prim; int object; @@ -466,6 +502,7 @@ typedef struct Intersection { #ifdef __KERNEL_DEBUG__ int num_traversal_steps; + int num_traversed_instances; #endif } Intersection; @@ -543,7 +580,11 @@ typedef enum AttributeStandard { /* Closure data */ #ifdef __MULTI_CLOSURE__ -#define MAX_CLOSURE 64 +# ifndef __MAX_CLOSURE__ +# define MAX_CLOSURE 64 +# else +# define MAX_CLOSURE __MAX_CLOSURE__ +# endif #else #define MAX_CLOSURE 1 #endif @@ -553,7 +594,7 @@ typedef enum AttributeStandard { * does not put own padding trying to align this members. * - We make sure OSL pointer is also 16 bytes aligned. 
*/ -typedef struct ShaderClosure { +typedef ccl_addr_space struct ShaderClosure { float3 weight; float3 N; float3 T; @@ -638,78 +679,23 @@ enum ShaderDataFlag { struct KernelGlobals; -typedef struct ShaderData { - /* position */ - float3 P; - /* smooth normal for shading */ - float3 N; - /* true geometric normal */ - float3 Ng; - /* view/incoming direction */ - float3 I; - /* shader id */ - int shader; - /* booleans describing shader, see ShaderDataFlag */ - int flag; - - /* primitive id if there is one, ~0 otherwise */ - int prim; - - /* combined type and curve segment for hair */ - int type; - - /* parametric coordinates - * - barycentric weights for triangles */ - float u, v; - /* object id if there is one, ~0 otherwise */ - int object; - - /* motion blur sample time */ - float time; - - /* length of the ray being shaded */ - float ray_length; - - /* ray bounce depth */ - int ray_depth; - - /* ray transparent depth */ - int transparent_depth; - -#ifdef __RAY_DIFFERENTIALS__ - /* differential of P. these are orthogonal to Ng, not N */ - differential3 dP; - /* differential of I */ - differential3 dI; - /* differential of u, v */ - differential du; - differential dv; -#endif -#ifdef __DPDU__ - /* differential of P w.r.t. parametric coordinates. note that dPdu is - * not readily suitable as a tangent for shading on triangles. 
*/ - float3 dPdu, dPdv; -#endif - -#ifdef __OBJECT_MOTION__ - /* object <-> world space transformations, cached to avoid - * re-interpolating them constantly for shading */ - Transform ob_tfm; - Transform ob_itfm; +#ifdef __SPLIT_KERNEL__ +#define SD_VAR(type, what) ccl_global type *what; +#define SD_CLOSURE_VAR(type, what, max_closure) type *what; +#define TIDX (get_global_id(1) * get_global_size(0) + get_global_id(0)) +#define ccl_fetch(s, t) (s->t[TIDX]) +#define ccl_fetch_array(s, t, index) (&s->t[TIDX * MAX_CLOSURE + index]) +#else +#define SD_VAR(type, what) type what; +#define SD_CLOSURE_VAR(type, what, max_closure) type what[max_closure]; +#define ccl_fetch(s, t) (s->t) +#define ccl_fetch_array(s, t, index) (&s->t[index]) #endif - /* Closure data, we store a fixed array of closures */ - ShaderClosure closure[MAX_CLOSURE]; - int num_closure; - float randb_closure; +typedef ccl_addr_space struct ShaderData { - /* ray start position, only set for backgrounds */ - float3 ray_P; - differential3 ray_dP; +#include "kernel_shaderdata_vars.h" -#ifdef __OSL__ - struct KernelGlobals *osl_globals; -#endif } ShaderData; /* Path State */ @@ -867,7 +853,9 @@ typedef struct KernelFilm { #ifdef __KERNEL_DEBUG__ int pass_bvh_traversal_steps; - int pass_pad3, pass_pad4, pass_pad5; + int pass_bvh_traversed_instances; + int pass_ray_bounces; + int pass_pad3; #endif } KernelFilm; @@ -895,6 +883,11 @@ typedef struct KernelIntegrator { float inv_pdf_lights; int pdf_background_res; + /* light portals */ + float portal_pdf; + int num_portals; + int portal_offset; + /* bounces */ int min_bounce; int max_bounce; @@ -947,6 +940,8 @@ typedef struct KernelIntegrator { int volume_max_steps; float volume_step_size; int volume_samples; + + int pad; } KernelIntegrator; typedef struct KernelBVH { @@ -980,9 +975,8 @@ typedef struct KernelCurves { } KernelCurves; typedef struct KernelTables { - int blackbody_offset; int beckmann_offset; - int pad1, pad2; + int pad1, pad2, pad3; } KernelTables; 
typedef struct KernelData { @@ -996,13 +990,64 @@ typedef struct KernelData { } KernelData; #ifdef __KERNEL_DEBUG__ -typedef struct DebugData { +typedef ccl_addr_space struct DebugData { // Total number of BVH node traversal steps and primitives intersections // for the camera rays. int num_bvh_traversal_steps; + int num_bvh_traversed_instances; + int num_ray_bounces; } DebugData; #endif +/* Declarations required for split kernel */ + +/* Macro for queues */ +/* Value marking queue's empty slot */ +#define QUEUE_EMPTY_SLOT -1 + +/* +* Queue 1 - Active rays +* Queue 2 - Background queue +* Queue 3 - Shadow ray cast kernel - AO +* Queue 4 - Shadow ray cast kernel - direct lighting +*/ +#define NUM_QUEUES 4 + +/* Queue names */ +enum QueueNumber { + QUEUE_ACTIVE_AND_REGENERATED_RAYS, /* All active rays and regenerated rays are enqueued here */ + QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS, /* All + * 1.Background-hit rays, + * 2.Rays that have exited path-iteration but need to update output buffer + * 3.Rays to be regenerated + * are enqueued here */ + QUEUE_SHADOW_RAY_CAST_AO_RAYS, /* All rays for which a shadow ray should be cast to determine radiance + contribution for AO are enqueued here */ + QUEUE_SHADOW_RAY_CAST_DL_RAYS, /* All rays for which a shadow ray should be cast to determine radiance + contribution for direct lighting are enqueued here */ +}; + +/* We use RAY_STATE_MASK to get ray_state (enums 0 to 6) */ +#define RAY_STATE_MASK 0x007 +#define RAY_FLAG_MASK 0x0F8 +enum RayState { + RAY_ACTIVE = 0, // Denotes ray is actively involved in path-iteration + RAY_INACTIVE = 1, // Denotes ray has completed processing all samples and is inactive + RAY_UPDATE_BUFFER = 2, // Denotes ray has exited path-iteration and needs to update output buffer + RAY_HIT_BACKGROUND = 3, // Denotes ray has hit background + RAY_TO_REGENERATE = 4, // Denotes ray has to be regenerated + RAY_REGENERATED = 5, // Denotes ray has been regenerated + RAY_SKIP_DL = 6, // Denotes ray should skip 
direct lighting + RAY_SHADOW_RAY_CAST_AO = 16, // Flag's ray has to execute shadow blocked function in AO part + RAY_SHADOW_RAY_CAST_DL = 32 // Flag's ray has to execute shadow blocked function in direct lighting part +}; + +#define ASSIGN_RAY_STATE(ray_state, ray_index, state) (ray_state[ray_index] = ((ray_state[ray_index] & RAY_FLAG_MASK) | state)) +#define IS_STATE(ray_state, ray_index, state) ((ray_state[ray_index] & RAY_STATE_MASK) == state) +#define ADD_RAY_FLAG(ray_state, ray_index, flag) (ray_state[ray_index] = (ray_state[ray_index] | flag)) +#define REMOVE_RAY_FLAG(ray_state, ray_index, flag) (ray_state[ray_index] = (ray_state[ray_index] & (~flag))) +#define IS_FLAG(ray_state, ray_index, flag) (ray_state[ray_index] & flag) + CCL_NAMESPACE_END #endif /* __KERNEL_TYPES_H__ */ diff --git a/intern/cycles/kernel/kernel_volume.h b/intern/cycles/kernel/kernel_volume.h index 0300e1d4c7f..e06568457c6 100644 --- a/intern/cycles/kernel/kernel_volume.h +++ b/intern/cycles/kernel/kernel_volume.h @@ -627,7 +627,7 @@ ccl_device void kernel_volume_decoupled_record(KernelGlobals *kg, PathState *sta step_size = kernel_data.integrator.volume_step_size; /* compute exact steps in advance for malloc */ max_steps = max((int)ceilf(ray->t/step_size), 1); - if (max_steps > global_max_steps) { + if(max_steps > global_max_steps) { max_steps = global_max_steps; step_size = ray->t / (float)max_steps; } @@ -993,6 +993,48 @@ ccl_device void kernel_volume_stack_init(KernelGlobals *kg, volume_ray.t = FLT_MAX; int stack_index = 0, enclosed_index = 0; + +#ifdef __VOLUME_RECORD_ALL__ + Intersection hits[2*VOLUME_STACK_SIZE]; + uint num_hits = scene_intersect_volume_all(kg, + &volume_ray, + hits, + 2*VOLUME_STACK_SIZE); + if(num_hits > 0) { + int enclosed_volumes[VOLUME_STACK_SIZE]; + Intersection *isect = hits; + + qsort(hits, num_hits, sizeof(Intersection), intersections_compare); + + for(uint hit = 0; hit < num_hits; ++hit, ++isect) { + ShaderData sd; + shader_setup_from_ray(kg, &sd, isect, 
&volume_ray, 0, 0); + if(sd.flag & SD_BACKFACING) { + /* If ray exited the volume and never entered to that volume + * it means that camera is inside such a volume. + */ + bool is_enclosed = false; + for(int i = 0; i < enclosed_index; ++i) { + if(enclosed_volumes[i] == sd.object) { + is_enclosed = true; + break; + } + } + if(is_enclosed == false) { + stack[stack_index].object = sd.object; + stack[stack_index].shader = sd.shader; + ++stack_index; + } + } + else { + /* If ray from camera enters the volume, this volume shouldn't + * be added to the stack on exit. + */ + enclosed_volumes[enclosed_index++] = sd.object; + } + } + } +#else int enclosed_volumes[VOLUME_STACK_SIZE]; int step = 0; @@ -1035,6 +1077,7 @@ ccl_device void kernel_volume_stack_init(KernelGlobals *kg, volume_ray.P = ray_offset(sd.P, -sd.Ng); ++step; } +#endif /* stack_index of 0 means quick checks outside of the kernel gave false * positive, nothing to worry about, just we've wasted quite a few of * ticks just to come into conclusion that camera is in the air. 
@@ -1097,4 +1140,49 @@ ccl_device void kernel_volume_stack_enter_exit(KernelGlobals *kg, ShaderData *sd } } +#ifdef __SUBSURFACE__ +ccl_device void kernel_volume_stack_update_for_subsurface(KernelGlobals *kg, + Ray *ray, + VolumeStack *stack) +{ + kernel_assert(kernel_data.integrator.use_volumes); + + Ray volume_ray = *ray; + +#ifdef __VOLUME_RECORD_ALL__ + Intersection hits[2*VOLUME_STACK_SIZE]; + uint num_hits = scene_intersect_volume_all(kg, + &volume_ray, + hits, + 2*VOLUME_STACK_SIZE); + if(num_hits > 0) { + Intersection *isect = hits; + + qsort(hits, num_hits, sizeof(Intersection), intersections_compare); + + for(uint hit = 0; hit < num_hits; ++hit, ++isect) { + ShaderData sd; + shader_setup_from_ray(kg, &sd, isect, &volume_ray, 0, 0); + kernel_volume_stack_enter_exit(kg, &sd, stack); + } + } +#else + Intersection isect; + int step = 0; + while(step < 2 * VOLUME_STACK_SIZE && + scene_intersect_volume(kg, &volume_ray, &isect)) + { + ShaderData sd; + shader_setup_from_ray(kg, &sd, &isect, &volume_ray, 0, 0); + kernel_volume_stack_enter_exit(kg, &sd, stack); + + /* Move ray forward. */ + volume_ray.P = ray_offset(sd.P, -sd.Ng); + volume_ray.t -= sd.ray_length; + ++step; + } +#endif +} +#endif + CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernel_work_stealing.h b/intern/cycles/kernel/kernel_work_stealing.h new file mode 100644 index 00000000000..9b83d972e97 --- /dev/null +++ b/intern/cycles/kernel/kernel_work_stealing.h @@ -0,0 +1,193 @@ +/* + * Copyright 2011-2015 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __KERNEL_WORK_STEALING_H__ +#define __KERNEL_WORK_STEALING_H__ + +/* + * Utility functions for work stealing + */ + +#ifdef __WORK_STEALING__ + +#ifdef __KERNEL_OPENCL__ +#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable +#endif + +uint get_group_id_with_ray_index(uint ray_index, + uint tile_dim_x, + uint tile_dim_y, + uint parallel_samples, + int dim) +{ + if(dim == 0) { + uint x_span = ray_index % (tile_dim_x * parallel_samples); + return x_span / get_local_size(0); + } + else /*if(dim == 1)*/ { + kernel_assert(dim == 1); + uint y_span = ray_index / (tile_dim_x * parallel_samples); + return y_span / get_local_size(1); + } +} + +uint get_total_work(uint tile_dim_x, + uint tile_dim_y, + uint grp_idx, + uint grp_idy, + uint num_samples) +{ + uint threads_within_tile_border_x = + (grp_idx == (get_num_groups(0) - 1)) ? tile_dim_x % get_local_size(0) + : get_local_size(0); + uint threads_within_tile_border_y = + (grp_idy == (get_num_groups(1) - 1)) ? tile_dim_y % get_local_size(1) + : get_local_size(1); + + threads_within_tile_border_x = + (threads_within_tile_border_x == 0) ? get_local_size(0) + : threads_within_tile_border_x; + threads_within_tile_border_y = + (threads_within_tile_border_y == 0) ? 
get_local_size(1) + : threads_within_tile_border_y; + + return threads_within_tile_border_x * + threads_within_tile_border_y * + num_samples; +} + +/* Returns 0 in case there is no next work available */ +/* Returns 1 in case work assigned is valid */ +int get_next_work(ccl_global uint *work_pool, + ccl_private uint *my_work, + uint tile_dim_x, + uint tile_dim_y, + uint num_samples, + uint parallel_samples, + uint ray_index) +{ + uint grp_idx = get_group_id_with_ray_index(ray_index, + tile_dim_x, + tile_dim_y, + parallel_samples, + 0); + uint grp_idy = get_group_id_with_ray_index(ray_index, + tile_dim_x, + tile_dim_y, + parallel_samples, + 1); + uint total_work = get_total_work(tile_dim_x, + tile_dim_y, + grp_idx, + grp_idy, + num_samples); + uint group_index = grp_idy * get_num_groups(0) + grp_idx; + *my_work = atomic_inc(&work_pool[group_index]); + return (*my_work < total_work) ? 1 : 0; +} + +/* This function assumes that the passed my_work is valid. */ +/* Decode sample number w.r.t. assigned my_work. */ +uint get_my_sample(uint my_work, + uint tile_dim_x, + uint tile_dim_y, + uint parallel_samples, + uint ray_index) +{ + uint grp_idx = get_group_id_with_ray_index(ray_index, + tile_dim_x, + tile_dim_y, + parallel_samples, + 0); + uint grp_idy = get_group_id_with_ray_index(ray_index, + tile_dim_x, + tile_dim_y, + parallel_samples, + 1); + uint threads_within_tile_border_x = + (grp_idx == (get_num_groups(0) - 1)) ? tile_dim_x % get_local_size(0) + : get_local_size(0); + uint threads_within_tile_border_y = + (grp_idy == (get_num_groups(1) - 1)) ? tile_dim_y % get_local_size(1) + : get_local_size(1); + + threads_within_tile_border_x = + (threads_within_tile_border_x == 0) ? get_local_size(0) + : threads_within_tile_border_x; + threads_within_tile_border_y = + (threads_within_tile_border_y == 0) ? 
get_local_size(1) + : threads_within_tile_border_y; + + return my_work / + (threads_within_tile_border_x * threads_within_tile_border_y); +} + +/* Decode pixel and tile position w.r.t. assigned my_work. */ +void get_pixel_tile_position(ccl_private uint *pixel_x, + ccl_private uint *pixel_y, + ccl_private uint *tile_x, + ccl_private uint *tile_y, + uint my_work, + uint tile_dim_x, + uint tile_dim_y, + uint tile_offset_x, + uint tile_offset_y, + uint parallel_samples, + uint ray_index) +{ + uint grp_idx = get_group_id_with_ray_index(ray_index, + tile_dim_x, + tile_dim_y, + parallel_samples, + 0); + uint grp_idy = get_group_id_with_ray_index(ray_index, + tile_dim_x, + tile_dim_y, + parallel_samples, + 1); + uint threads_within_tile_border_x = + (grp_idx == (get_num_groups(0) - 1)) ? tile_dim_x % get_local_size(0) + : get_local_size(0); + uint threads_within_tile_border_y = + (grp_idy == (get_num_groups(1) - 1)) ? tile_dim_y % get_local_size(1) + : get_local_size(1); + + threads_within_tile_border_x = + (threads_within_tile_border_x == 0) ? get_local_size(0) + : threads_within_tile_border_x; + threads_within_tile_border_y = + (threads_within_tile_border_y == 0) ? 
get_local_size(1) + : threads_within_tile_border_y; + + uint total_associated_pixels = + threads_within_tile_border_x * threads_within_tile_border_y; + uint work_group_pixel_index = my_work % total_associated_pixels; + uint work_group_pixel_x = + work_group_pixel_index % threads_within_tile_border_x; + uint work_group_pixel_y = + work_group_pixel_index / threads_within_tile_border_x; + + *pixel_x = + tile_offset_x + (grp_idx * get_local_size(0)) + work_group_pixel_x; + *pixel_y = + tile_offset_y + (grp_idy * get_local_size(1)) + work_group_pixel_y; + *tile_x = *pixel_x - tile_offset_x; + *tile_y = *pixel_y - tile_offset_y; +} + +#endif /* __WORK_STEALING__ */ + +#endif /* __KERNEL_WORK_STEALING_H__ */ diff --git a/intern/cycles/kernel/kernel.cpp b/intern/cycles/kernel/kernels/cpu/kernel.cpp index 013eeff57fa..37a73ab2f04 100644 --- a/intern/cycles/kernel/kernel.cpp +++ b/intern/cycles/kernel/kernels/cpu/kernel.cpp @@ -23,6 +23,7 @@ #include "kernel_globals.h" #include "kernel_film.h" #include "kernel_path.h" +#include "kernel_path_branched.h" #include "kernel_bake.h" CCL_NAMESPACE_BEGIN @@ -55,7 +56,7 @@ void kernel_tex_copy(KernelGlobals *kg, const char *name, device_ptr mem, size_t int id = atoi(name + strlen("__tex_image_float_")); int array_index = id; - if (array_index >= 0 && array_index < MAX_FLOAT_IMAGES) { + if(array_index >= 0 && array_index < MAX_FLOAT_IMAGES) { tex = &kg->texture_float_images[array_index]; } @@ -70,7 +71,7 @@ void kernel_tex_copy(KernelGlobals *kg, const char *name, device_ptr mem, size_t int id = atoi(name + strlen("__tex_image_")); int array_index = id - MAX_FLOAT_IMAGES; - if (array_index >= 0 && array_index < MAX_BYTE_IMAGES) { + if(array_index >= 0 && array_index < MAX_BYTE_IMAGES) { tex = &kg->texture_byte_images[array_index]; } diff --git a/intern/cycles/kernel/kernel_avx.cpp b/intern/cycles/kernel/kernels/cpu/kernel_avx.cpp index f1027ad413d..df77bedc729 100644 --- a/intern/cycles/kernel/kernel_avx.cpp +++ 
b/intern/cycles/kernel/kernels/cpu/kernel_avx.cpp @@ -38,6 +38,7 @@ #include "kernel_globals.h" #include "kernel_film.h" #include "kernel_path.h" +#include "kernel_path_branched.h" #include "kernel_bake.h" CCL_NAMESPACE_BEGIN diff --git a/intern/cycles/kernel/kernel_avx2.cpp b/intern/cycles/kernel/kernels/cpu/kernel_avx2.cpp index b2f16ff54d8..b3192369794 100644 --- a/intern/cycles/kernel/kernel_avx2.cpp +++ b/intern/cycles/kernel/kernels/cpu/kernel_avx2.cpp @@ -39,6 +39,7 @@ #include "kernel_globals.h" #include "kernel_film.h" #include "kernel_path.h" +#include "kernel_path_branched.h" #include "kernel_bake.h" CCL_NAMESPACE_BEGIN diff --git a/intern/cycles/kernel/kernel_sse2.cpp b/intern/cycles/kernel/kernels/cpu/kernel_sse2.cpp index cc8c603e8f8..f9c5134e442 100644 --- a/intern/cycles/kernel/kernel_sse2.cpp +++ b/intern/cycles/kernel/kernels/cpu/kernel_sse2.cpp @@ -34,6 +34,7 @@ #include "kernel_globals.h" #include "kernel_film.h" #include "kernel_path.h" +#include "kernel_path_branched.h" #include "kernel_bake.h" CCL_NAMESPACE_BEGIN diff --git a/intern/cycles/kernel/kernel_sse3.cpp b/intern/cycles/kernel/kernels/cpu/kernel_sse3.cpp index 20919a4f26e..2dbe4b81821 100644 --- a/intern/cycles/kernel/kernel_sse3.cpp +++ b/intern/cycles/kernel/kernels/cpu/kernel_sse3.cpp @@ -36,6 +36,7 @@ #include "kernel_globals.h" #include "kernel_film.h" #include "kernel_path.h" +#include "kernel_path_branched.h" #include "kernel_bake.h" CCL_NAMESPACE_BEGIN diff --git a/intern/cycles/kernel/kernel_sse41.cpp b/intern/cycles/kernel/kernels/cpu/kernel_sse41.cpp index 48579d3b7e5..5c57ad01181 100644 --- a/intern/cycles/kernel/kernel_sse41.cpp +++ b/intern/cycles/kernel/kernels/cpu/kernel_sse41.cpp @@ -37,6 +37,7 @@ #include "kernel_globals.h" #include "kernel_film.h" #include "kernel_path.h" +#include "kernel_path_branched.h" #include "kernel_bake.h" CCL_NAMESPACE_BEGIN diff --git a/intern/cycles/kernel/kernel.cu b/intern/cycles/kernel/kernels/cuda/kernel.cu index 
64069fc049f..bcd55b8c676 100644 --- a/intern/cycles/kernel/kernel.cu +++ b/intern/cycles/kernel/kernels/cuda/kernel.cu @@ -16,13 +16,14 @@ /* CUDA kernel entry points */ -#include "kernel_compat_cuda.h" -#include "kernel_math.h" -#include "kernel_types.h" -#include "kernel_globals.h" -#include "kernel_film.h" -#include "kernel_path.h" -#include "kernel_bake.h" +#include "../../kernel_compat_cuda.h" +#include "../../kernel_math.h" +#include "../../kernel_types.h" +#include "../../kernel_globals.h" +#include "../../kernel_film.h" +#include "../../kernel_path.h" +#include "../../kernel_path_branched.h" +#include "../../kernel_bake.h" /* device data taken from CUDA occupancy calculator */ diff --git a/intern/cycles/kernel/kernel.cl b/intern/cycles/kernel/kernels/opencl/kernel.cl index 5a47260a4ee..15fb34cfe3b 100644 --- a/intern/cycles/kernel/kernel.cl +++ b/intern/cycles/kernel/kernels/opencl/kernel.cl @@ -16,14 +16,17 @@ /* OpenCL kernel entry points - unfinished */ -#include "kernel_compat_opencl.h" -#include "kernel_math.h" -#include "kernel_types.h" -#include "kernel_globals.h" +#include "../../kernel_compat_opencl.h" +#include "../../kernel_math.h" +#include "../../kernel_types.h" +#include "../../kernel_globals.h" -#include "kernel_film.h" -#include "kernel_path.h" -#include "kernel_bake.h" +#include "../../kernel_film.h" +#include "../../kernel_path.h" +#include "../../kernel_path_branched.h" +#include "../../kernel_bake.h" + +#ifdef __COMPILE_ONLY_MEGAKERNEL__ __kernel void kernel_ocl_path_trace( ccl_constant KernelData *data, @@ -32,7 +35,7 @@ __kernel void kernel_ocl_path_trace( #define KERNEL_TEX(type, ttype, name) \ ccl_global type *name, -#include "kernel_textures.h" +#include "../../kernel_textures.h" int sample, int sx, int sy, int sw, int sh, int offset, int stride) @@ -43,7 +46,7 @@ __kernel void kernel_ocl_path_trace( #define KERNEL_TEX(type, ttype, name) \ kg->name = name; -#include "kernel_textures.h" +#include "../../kernel_textures.h" int x = sx 
+ get_global_id(0); int y = sy + get_global_id(1); @@ -52,17 +55,18 @@ __kernel void kernel_ocl_path_trace( kernel_path_trace(kg, buffer, rng_state, sample, x, y, offset, stride); } -__kernel void kernel_ocl_convert_to_byte( +#else // __COMPILE_ONLY_MEGAKERNEL__ + +__kernel void kernel_ocl_shader( ccl_constant KernelData *data, - ccl_global uchar4 *rgba, - ccl_global float *buffer, + ccl_global uint4 *input, + ccl_global float4 *output, #define KERNEL_TEX(type, ttype, name) \ ccl_global type *name, -#include "kernel_textures.h" +#include "../../kernel_textures.h" - float sample_scale, - int sx, int sy, int sw, int sh, int offset, int stride) + int type, int sx, int sw, int offset, int sample) { KernelGlobals kglobals, *kg = &kglobals; @@ -70,26 +74,24 @@ __kernel void kernel_ocl_convert_to_byte( #define KERNEL_TEX(type, ttype, name) \ kg->name = name; -#include "kernel_textures.h" +#include "../../kernel_textures.h" int x = sx + get_global_id(0); - int y = sy + get_global_id(1); - if(x < sx + sw && y < sy + sh) - kernel_film_convert_to_byte(kg, rgba, buffer, sample_scale, x, y, offset, stride); + if(x < sx + sw) + kernel_shader_evaluate(kg, input, output, (ShaderEvalType)type, x, sample); } -__kernel void kernel_ocl_convert_to_half_float( +__kernel void kernel_ocl_bake( ccl_constant KernelData *data, - ccl_global uchar4 *rgba, - ccl_global float *buffer, + ccl_global uint4 *input, + ccl_global float4 *output, #define KERNEL_TEX(type, ttype, name) \ ccl_global type *name, -#include "kernel_textures.h" +#include "../../kernel_textures.h" - float sample_scale, - int sx, int sy, int sw, int sh, int offset, int stride) + int type, int sx, int sw, int offset, int sample) { KernelGlobals kglobals, *kg = &kglobals; @@ -97,25 +99,36 @@ __kernel void kernel_ocl_convert_to_half_float( #define KERNEL_TEX(type, ttype, name) \ kg->name = name; -#include "kernel_textures.h" +#include "../../kernel_textures.h" int x = sx + get_global_id(0); - int y = sy + get_global_id(1); - if(x 
< sx + sw && y < sy + sh) - kernel_film_convert_to_half_float(kg, rgba, buffer, sample_scale, x, y, offset, stride); + if(x < sx + sw) { +#if defined(__KERNEL_OPENCL_NVIDIA__) && __COMPUTE_CAPABILITY__ < 300 + /* NVidia compiler is spending infinite amount of time trying + * to deal with kernel_bake_evaluate() on architectures prior + * to sm_30. + * For now we disable baking kernel for those devices, so at + * least rendering with split kernel could be compiled. + */ + output[x] = make_float4(0.0f, 0.0f, 0.0f, 0.0f); +#else + kernel_bake_evaluate(kg, input, output, (ShaderEvalType)type, x, offset, sample); +#endif + } } -__kernel void kernel_ocl_shader( +__kernel void kernel_ocl_convert_to_byte( ccl_constant KernelData *data, - ccl_global uint4 *input, - ccl_global float4 *output, + ccl_global uchar4 *rgba, + ccl_global float *buffer, #define KERNEL_TEX(type, ttype, name) \ ccl_global type *name, -#include "kernel_textures.h" +#include "../../kernel_textures.h" - int type, int sx, int sw, int offset, int sample) + float sample_scale, + int sx, int sy, int sw, int sh, int offset, int stride) { KernelGlobals kglobals, *kg = &kglobals; @@ -123,24 +136,26 @@ __kernel void kernel_ocl_shader( #define KERNEL_TEX(type, ttype, name) \ kg->name = name; -#include "kernel_textures.h" +#include "../../kernel_textures.h" int x = sx + get_global_id(0); + int y = sy + get_global_id(1); - if(x < sx + sw) - kernel_shader_evaluate(kg, input, output, (ShaderEvalType)type, x, sample); + if(x < sx + sw && y < sy + sh) + kernel_film_convert_to_byte(kg, rgba, buffer, sample_scale, x, y, offset, stride); } -__kernel void kernel_ocl_bake( +__kernel void kernel_ocl_convert_to_half_float( ccl_constant KernelData *data, - ccl_global uint4 *input, - ccl_global float4 *output, + ccl_global uchar4 *rgba, + ccl_global float *buffer, #define KERNEL_TEX(type, ttype, name) \ ccl_global type *name, -#include "kernel_textures.h" +#include "../../kernel_textures.h" - int type, int sx, int sw, int 
offset, int sample) + float sample_scale, + int sx, int sy, int sw, int sh, int offset, int stride) { KernelGlobals kglobals, *kg = &kglobals; @@ -148,11 +163,13 @@ __kernel void kernel_ocl_bake( #define KERNEL_TEX(type, ttype, name) \ kg->name = name; -#include "kernel_textures.h" +#include "../../kernel_textures.h" int x = sx + get_global_id(0); + int y = sy + get_global_id(1); - if(x < sx + sw) - kernel_bake_evaluate(kg, input, output, (ShaderEvalType)type, x, offset, sample); + if(x < sx + sw && y < sy + sh) + kernel_film_convert_to_half_float(kg, rgba, buffer, sample_scale, x, y, offset, stride); } +#endif // __COMPILE_ONLY_MEGAKERNEL__
\ No newline at end of file diff --git a/intern/cycles/kernel/kernels/opencl/kernel_background_buffer_update.cl b/intern/cycles/kernel/kernels/opencl/kernel_background_buffer_update.cl new file mode 100644 index 00000000000..eff77b89a0a --- /dev/null +++ b/intern/cycles/kernel/kernels/opencl/kernel_background_buffer_update.cl @@ -0,0 +1,128 @@ +/* + * Copyright 2011-2015 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "split/kernel_background_buffer_update.h" +/* OpenCL entry point: fetches a ray index from QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS, runs kernel_background_buffer_update() on it, and passes the returned flag to enqueue_ray_index_local() for QUEUE_ACTIVE_AND_REGENERATED_RAYS. */ +__kernel void kernel_ocl_path_trace_background_buffer_update( + ccl_global char *globals, + ccl_constant KernelData *data, + ccl_global char *shader_data, + ccl_global float *per_sample_output_buffers, + ccl_global uint *rng_state, + ccl_global uint *rng_coop, /* Required for buffer Update */ + ccl_global float3 *throughput_coop, /* Required for background hit processing */ + PathRadiance *PathRadiance_coop, /* Required for background hit processing and buffer Update */ + ccl_global Ray *Ray_coop, /* Required for background hit processing */ + ccl_global PathState *PathState_coop, /* Required for background hit processing */ + ccl_global float *L_transparent_coop, /* Required for background hit processing and buffer Update */ + ccl_global char *ray_state, /* Stores information on the current state of a ray */ + int sw, int sh, int sx, int sy, int stride, + int rng_state_offset_x, + int rng_state_offset_y, + int rng_state_stride, + ccl_global unsigned int 
*work_array, /* Denotes work of each ray */ + ccl_global int *Queue_data, /* Queues memory */ + ccl_global int *Queue_index, /* Tracks the number of elements in each queue */ + int queuesize, /* Size (capacity) of each queue */ + int end_sample, + int start_sample, +#ifdef __WORK_STEALING__ + ccl_global unsigned int *work_pool_wgs, + unsigned int num_samples, +#endif +#ifdef __KERNEL_DEBUG__ + DebugData *debugdata_coop, +#endif + int parallel_samples) /* Number of samples to be processed in parallel */ +{ + ccl_local unsigned int local_queue_atomics; + if(get_local_id(0) == 0 && get_local_id(1) == 0) { + local_queue_atomics = 0; + } + barrier(CLK_LOCAL_MEM_FENCE); + + int ray_index = get_global_id(1) * get_global_size(0) + get_global_id(0); + if(ray_index == 0) { + /* We will empty this queue in this kernel. */ + Queue_index[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS] = 0; + } + char enqueue_flag = 0; + ray_index = get_ray_index(ray_index, + QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS, + Queue_data, + queuesize, + 1); + +#ifdef __COMPUTE_DEVICE_GPU__ + /* If we are executing on a GPU device, we exit all threads that are not + * required. + * + * If we are executing on a CPU device, then we need to keep all threads + * active since we have barrier() calls later in the kernel. CPU devices + * expect all threads to execute the barrier statement. 
+ */ + if(ray_index == QUEUE_EMPTY_SLOT) { + return; + } +#endif + +#ifndef __COMPUTE_DEVICE_GPU__ + if(ray_index != QUEUE_EMPTY_SLOT) { +#endif + enqueue_flag = + kernel_background_buffer_update(globals, + data, + shader_data, + per_sample_output_buffers, + rng_state, + rng_coop, + throughput_coop, + PathRadiance_coop, + Ray_coop, + PathState_coop, + L_transparent_coop, + ray_state, + sw, sh, sx, sy, stride, + rng_state_offset_x, + rng_state_offset_y, + rng_state_stride, + work_array, + end_sample, + start_sample, +#ifdef __WORK_STEALING__ + work_pool_wgs, + num_samples, +#endif +#ifdef __KERNEL_DEBUG__ + debugdata_coop, +#endif + parallel_samples, + ray_index); +#ifndef __COMPUTE_DEVICE_GPU__ + } +#endif + + /* Enqueue RAY_REGENERATED rays into QUEUE_ACTIVE_AND_REGENERATED_RAYS; + * these rays will be made active during the next scene-intersect kernel. + */ + enqueue_ray_index_local(ray_index, + QUEUE_ACTIVE_AND_REGENERATED_RAYS, + enqueue_flag, + queuesize, + &local_queue_atomics, + Queue_data, + Queue_index); +} diff --git a/intern/cycles/kernel/kernels/opencl/kernel_data_init.cl b/intern/cycles/kernel/kernels/opencl/kernel_data_init.cl new file mode 100644 index 00000000000..c3277676029 --- /dev/null +++ b/intern/cycles/kernel/kernels/opencl/kernel_data_init.cl @@ -0,0 +1,241 @@ +/* + * Copyright 2011-2015 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "split/kernel_data_init.h" +/* OpenCL entry point that forwards all of its arguments, in declaration order, to kernel_data_init(). */ +__kernel void kernel_ocl_path_trace_data_init( + ccl_global char *globals, + ccl_global char *shader_data_sd, /* Arguments related to ShaderData */ + ccl_global char *shader_data_sd_DL_shadow, /* Arguments related to ShaderData */ + + ccl_global float3 *P_sd, + ccl_global float3 *P_sd_DL_shadow, + + ccl_global float3 *N_sd, + ccl_global float3 *N_sd_DL_shadow, + + ccl_global float3 *Ng_sd, + ccl_global float3 *Ng_sd_DL_shadow, + + ccl_global float3 *I_sd, + ccl_global float3 *I_sd_DL_shadow, + + ccl_global int *shader_sd, + ccl_global int *shader_sd_DL_shadow, + + ccl_global int *flag_sd, + ccl_global int *flag_sd_DL_shadow, + + ccl_global int *prim_sd, + ccl_global int *prim_sd_DL_shadow, + + ccl_global int *type_sd, + ccl_global int *type_sd_DL_shadow, + + ccl_global float *u_sd, + ccl_global float *u_sd_DL_shadow, + + ccl_global float *v_sd, + ccl_global float *v_sd_DL_shadow, + + ccl_global int *object_sd, + ccl_global int *object_sd_DL_shadow, + + ccl_global float *time_sd, + ccl_global float *time_sd_DL_shadow, + + ccl_global float *ray_length_sd, + ccl_global float *ray_length_sd_DL_shadow, + + ccl_global int *ray_depth_sd, + ccl_global int *ray_depth_sd_DL_shadow, + + ccl_global int *transparent_depth_sd, + ccl_global int *transparent_depth_sd_DL_shadow, + + /* Ray differentials. */ + ccl_global differential3 *dP_sd, + ccl_global differential3 *dP_sd_DL_shadow, + + ccl_global differential3 *dI_sd, + ccl_global differential3 *dI_sd_DL_shadow, + + ccl_global differential *du_sd, + ccl_global differential *du_sd_DL_shadow, + + ccl_global differential *dv_sd, + ccl_global differential *dv_sd_DL_shadow, + + /* Dp/Du */ + ccl_global float3 *dPdu_sd, + ccl_global float3 *dPdu_sd_DL_shadow, + + ccl_global float3 *dPdv_sd, + ccl_global float3 *dPdv_sd_DL_shadow, + + /* Object motion. 
*/ + ccl_global Transform *ob_tfm_sd, + ccl_global Transform *ob_tfm_sd_DL_shadow, + + ccl_global Transform *ob_itfm_sd, + ccl_global Transform *ob_itfm_sd_DL_shadow, + + ShaderClosure *closure_sd, + ShaderClosure *closure_sd_DL_shadow, + + ccl_global int *num_closure_sd, + ccl_global int *num_closure_sd_DL_shadow, + + ccl_global float *randb_closure_sd, + ccl_global float *randb_closure_sd_DL_shadow, + + ccl_global float3 *ray_P_sd, + ccl_global float3 *ray_P_sd_DL_shadow, + + ccl_global differential3 *ray_dP_sd, + ccl_global differential3 *ray_dP_sd_DL_shadow, + + ccl_constant KernelData *data, + ccl_global float *per_sample_output_buffers, + ccl_global uint *rng_state, + ccl_global uint *rng_coop, /* rng array to store rng values for all rays */ + ccl_global float3 *throughput_coop, /* throughput array to store throughput values for all rays */ + ccl_global float *L_transparent_coop, /* L_transparent array to store L_transparent values for all rays */ + PathRadiance *PathRadiance_coop, /* PathRadiance array to store PathRadiance values for all rays */ + ccl_global Ray *Ray_coop, /* Ray array to store Ray information for all rays */ + ccl_global PathState *PathState_coop, /* PathState array to store PathState information for all rays */ + ccl_global char *ray_state, /* Stores information on current state of a ray */ + +#define KERNEL_TEX(type, ttype, name) \ + ccl_global type *name, +#include "../../kernel_textures.h" + + int start_sample, int sx, int sy, int sw, int sh, int offset, int stride, + int rng_state_offset_x, + int rng_state_offset_y, + int rng_state_stride, + ccl_global int *Queue_data, /* Memory for queues */ + ccl_global int *Queue_index, /* Tracks the number of elements in queues */ + int queuesize, /* size (capacity) of the queue */ + ccl_global char *use_queues_flag, /* flag to decide if scene-intersect kernel should use queues to fetch ray index */ + ccl_global unsigned int *work_array, /* work array to store which work each ray belongs to */ 
+#ifdef __WORK_STEALING__ + ccl_global unsigned int *work_pool_wgs, /* Work pool for each work group */ + unsigned int num_samples, /* Total number of samples per pixel */ +#endif +#ifdef __KERNEL_DEBUG__ + DebugData *debugdata_coop, +#endif + int parallel_samples) /* Number of samples to be processed in parallel */ +{ + kernel_data_init(globals, + shader_data_sd, + shader_data_sd_DL_shadow, + P_sd, + P_sd_DL_shadow, + N_sd, + N_sd_DL_shadow, + Ng_sd, + Ng_sd_DL_shadow, + I_sd, + I_sd_DL_shadow, + shader_sd, + shader_sd_DL_shadow, + flag_sd, + flag_sd_DL_shadow, + prim_sd, + prim_sd_DL_shadow, + type_sd, + type_sd_DL_shadow, + u_sd, + u_sd_DL_shadow, + v_sd, + v_sd_DL_shadow, + object_sd, + object_sd_DL_shadow, + time_sd, + time_sd_DL_shadow, + ray_length_sd, + ray_length_sd_DL_shadow, + ray_depth_sd, + ray_depth_sd_DL_shadow, + transparent_depth_sd, + transparent_depth_sd_DL_shadow, + + /* Ray differentials. */ + dP_sd, + dP_sd_DL_shadow, + dI_sd, + dI_sd_DL_shadow, + du_sd, + du_sd_DL_shadow, + dv_sd, + dv_sd_DL_shadow, + + /* Dp/Du */ + dPdu_sd, + dPdu_sd_DL_shadow, + dPdv_sd, + dPdv_sd_DL_shadow, + + /* Object motion. 
*/ + ob_tfm_sd, + ob_tfm_sd_DL_shadow, + ob_itfm_sd, + ob_itfm_sd_DL_shadow, + + closure_sd, + closure_sd_DL_shadow, + num_closure_sd, + num_closure_sd_DL_shadow, + randb_closure_sd, + randb_closure_sd_DL_shadow, + ray_P_sd, + ray_P_sd_DL_shadow, + ray_dP_sd, + ray_dP_sd_DL_shadow, + data, + per_sample_output_buffers, + rng_state, + rng_coop, + throughput_coop, + L_transparent_coop, + PathRadiance_coop, + Ray_coop, + PathState_coop, + ray_state, + +#define KERNEL_TEX(type, ttype, name) name, +#include "../../kernel_textures.h" + + start_sample, sx, sy, sw, sh, offset, stride, + rng_state_offset_x, + rng_state_offset_y, + rng_state_stride, + Queue_data, + Queue_index, + queuesize, + use_queues_flag, + work_array, +#ifdef __WORK_STEALING__ + work_pool_wgs, + num_samples, +#endif +#ifdef __KERNEL_DEBUG__ + debugdata_coop, +#endif + parallel_samples); +} diff --git a/intern/cycles/kernel/kernels/opencl/kernel_direct_lighting.cl b/intern/cycles/kernel/kernels/opencl/kernel_direct_lighting.cl new file mode 100644 index 00000000000..6ec75013b3a --- /dev/null +++ b/intern/cycles/kernel/kernels/opencl/kernel_direct_lighting.cl @@ -0,0 +1,90 @@ +/* + * Copyright 2011-2015 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "split/kernel_direct_lighting.h" +/* OpenCL entry point: fetches a ray index from QUEUE_ACTIVE_AND_REGENERATED_RAYS, runs kernel_direct_lighting() on it and, when __EMISSION__ is defined, passes the returned flag to enqueue_ray_index_local() for QUEUE_SHADOW_RAY_CAST_DL_RAYS. */ +__kernel void kernel_ocl_path_trace_direct_lighting( + ccl_global char *globals, + ccl_constant KernelData *data, + ccl_global char *shader_data, /* Required for direct lighting */ + ccl_global char *shader_DL, /* Required for direct lighting */ + ccl_global uint *rng_coop, /* Required for direct lighting */ + ccl_global PathState *PathState_coop, /* Required for direct lighting */ + ccl_global int *ISLamp_coop, /* Required for direct lighting */ + ccl_global Ray *LightRay_coop, /* Required for direct lighting */ + ccl_global BsdfEval *BSDFEval_coop, /* Required for direct lighting */ + ccl_global char *ray_state, /* Denotes the state of each ray */ + ccl_global int *Queue_data, /* Queue memory */ + ccl_global int *Queue_index, /* Tracks the number of elements in each queue */ + int queuesize) /* Size (capacity) of each queue */ +{ + ccl_local unsigned int local_queue_atomics; + if(get_local_id(0) == 0 && get_local_id(1) == 0) { + local_queue_atomics = 0; + } + barrier(CLK_LOCAL_MEM_FENCE); + + char enqueue_flag = 0; + int ray_index = get_global_id(1) * get_global_size(0) + get_global_id(0); + ray_index = get_ray_index(ray_index, + QUEUE_ACTIVE_AND_REGENERATED_RAYS, + Queue_data, + queuesize, + 0); + +#ifdef __COMPUTE_DEVICE_GPU__ + /* If we are executing on a GPU device, we exit all threads that are not + * required. + * + * If we are executing on a CPU device, then we need to keep all threads + * active since we have barrier() calls later in the kernel. CPU devices + * expect all threads to execute the barrier statement. 
+ */ + if(ray_index == QUEUE_EMPTY_SLOT) { + return; + } +#endif + +#ifndef __COMPUTE_DEVICE_GPU__ + if(ray_index != QUEUE_EMPTY_SLOT) { +#endif + enqueue_flag = kernel_direct_lighting(globals, + data, + shader_data, + shader_DL, + rng_coop, + PathState_coop, + ISLamp_coop, + LightRay_coop, + BSDFEval_coop, + ray_state, + ray_index); + +#ifndef __COMPUTE_DEVICE_GPU__ + } +#endif + +#ifdef __EMISSION__ + /* Enqueue RAY_SHADOW_RAY_CAST_DL rays. */ + enqueue_ray_index_local(ray_index, + QUEUE_SHADOW_RAY_CAST_DL_RAYS, + enqueue_flag, + queuesize, + &local_queue_atomics, + Queue_data, + Queue_index); +#endif +} diff --git a/intern/cycles/kernel/kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl b/intern/cycles/kernel/kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl new file mode 100644 index 00000000000..ae5f5cd1b3b --- /dev/null +++ b/intern/cycles/kernel/kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl @@ -0,0 +1,124 @@ +/* + * Copyright 2011-2015 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "split/kernel_holdout_emission_blurring_pathtermination_ao.h" +/* OpenCL entry point: fetches a ray index from QUEUE_ACTIVE_AND_REGENERATED_RAYS, runs kernel_holdout_emission_blurring_pathtermination_ao() on it, then passes the two flags it fills in to enqueue_ray_index_local() for QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS and, when __AO__ is defined, QUEUE_SHADOW_RAY_CAST_AO_RAYS. */ +__kernel void kernel_ocl_path_trace_holdout_emission_blurring_pathtermination_ao( + ccl_global char *globals, + ccl_constant KernelData *data, + ccl_global char *shader_data, /* Required throughout the kernel except probabilistic path termination and AO */ + ccl_global float *per_sample_output_buffers, + ccl_global uint *rng_coop, /* Required for "kernel_write_data_passes" and AO */ + ccl_global float3 *throughput_coop, /* Required for handling holdout material and AO */ + ccl_global float *L_transparent_coop, /* Required for handling holdout material */ + PathRadiance *PathRadiance_coop, /* Required for "kernel_write_data_passes" and indirect primitive emission */ + ccl_global PathState *PathState_coop, /* Required throughout the kernel and AO */ + Intersection *Intersection_coop, /* Required for indirect primitive emission */ + ccl_global float3 *AOAlpha_coop, /* Required for AO */ + ccl_global float3 *AOBSDF_coop, /* Required for AO */ + ccl_global Ray *AOLightRay_coop, /* Required for AO */ + int sw, int sh, int sx, int sy, int stride, + ccl_global char *ray_state, /* Denotes the state of each ray */ + ccl_global unsigned int *work_array, /* Denotes the work that each ray belongs to */ + ccl_global int *Queue_data, /* Queue memory */ + ccl_global int *Queue_index, /* Tracks the number of elements in each queue */ + int queuesize, /* Size (capacity) of each queue */ +#ifdef __WORK_STEALING__ + unsigned int start_sample, +#endif + int parallel_samples) /* Number of samples to be processed in parallel */ +{ + ccl_local unsigned int local_queue_atomics_bg; + ccl_local unsigned int local_queue_atomics_ao; + if(get_local_id(0) == 0 && get_local_id(1) == 0) { + local_queue_atomics_bg = 0; + local_queue_atomics_ao = 0; + } + barrier(CLK_LOCAL_MEM_FENCE); + + char enqueue_flag = 0; + char enqueue_flag_AO_SHADOW_RAY_CAST = 0; + int ray_index = get_global_id(1) * get_global_size(0) + 
get_global_id(0); + ray_index = get_ray_index(ray_index, + QUEUE_ACTIVE_AND_REGENERATED_RAYS, + Queue_data, + queuesize, + 0); + +#ifdef __COMPUTE_DEVICE_GPU__ + /* If we are executing on a GPU device, we exit all threads that are not + * required. + * + * If we are executing on a CPU device, then we need to keep all threads + * active since we have barrier() calls later in the kernel. CPU devices + * expect all threads to execute the barrier statement. + */ + if(ray_index == QUEUE_EMPTY_SLOT) { + return; + } +#endif /* __COMPUTE_DEVICE_GPU__ */ + +#ifndef __COMPUTE_DEVICE_GPU__ + if(ray_index != QUEUE_EMPTY_SLOT) { +#endif + kernel_holdout_emission_blurring_pathtermination_ao( + globals, + data, + shader_data, + per_sample_output_buffers, + rng_coop, + throughput_coop, + L_transparent_coop, + PathRadiance_coop, + PathState_coop, + Intersection_coop, + AOAlpha_coop, + AOBSDF_coop, + AOLightRay_coop, + sw, sh, sx, sy, stride, + ray_state, + work_array, +#ifdef __WORK_STEALING__ + start_sample, +#endif + parallel_samples, + ray_index, + &enqueue_flag, + &enqueue_flag_AO_SHADOW_RAY_CAST); +#ifndef __COMPUTE_DEVICE_GPU__ + } +#endif + + /* Enqueue RAY_UPDATE_BUFFER rays. */ + enqueue_ray_index_local(ray_index, + QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS, + enqueue_flag, + queuesize, + &local_queue_atomics_bg, + Queue_data, + Queue_index); + +#ifdef __AO__ + /* Enqueue to-shadow-ray-cast rays. 
*/ + enqueue_ray_index_local(ray_index, + QUEUE_SHADOW_RAY_CAST_AO_RAYS, + enqueue_flag_AO_SHADOW_RAY_CAST, + queuesize, + &local_queue_atomics_ao, + Queue_data, + Queue_index); +#endif +} diff --git a/intern/cycles/kernel/kernels/opencl/kernel_lamp_emission.cl b/intern/cycles/kernel/kernels/opencl/kernel_lamp_emission.cl new file mode 100644 index 00000000000..1bc7808d834 --- /dev/null +++ b/intern/cycles/kernel/kernels/opencl/kernel_lamp_emission.cl @@ -0,0 +1,84 @@ +/* + * Copyright 2011-2015 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "split/kernel_lamp_emission.h" +/* OpenCL entry point: work-item (0, 0) zeroes Queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS]; each work-item then takes its ray index from that queue when use_queues_flag[0] is non-zero, or from its (x, y) position otherwise, and calls kernel_lamp_emission() for it. */ +__kernel void kernel_ocl_path_trace_lamp_emission( + ccl_global char *globals, + ccl_constant KernelData *data, + ccl_global char *shader_data, /* Required for lamp emission */ + ccl_global float3 *throughput_coop, /* Required for lamp emission */ + PathRadiance *PathRadiance_coop, /* Required for lamp emission */ + ccl_global Ray *Ray_coop, /* Required for lamp emission */ + ccl_global PathState *PathState_coop, /* Required for lamp emission */ + Intersection *Intersection_coop, /* Required for lamp emission */ + ccl_global char *ray_state, /* Denotes the state of each ray */ + int sw, int sh, + ccl_global int *Queue_data, /* Memory for queues */ + ccl_global int *Queue_index, /* Tracks the number of elements in queues */ + int queuesize, /* Size (capacity) of queues */ + ccl_global char *use_queues_flag, /* Used to decide if this kernel should use + * queues to fetch ray index + */ + int parallel_samples) /* Number of samples to be processed in parallel */ +{ + int x = get_global_id(0); + int y = get_global_id(1); + + /* We will empty this queue in this kernel. */ + if(get_global_id(0) == 0 && get_global_id(1) == 0) { + Queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS] = 0; + } + /* Fetch use_queues_flag. 
*/ + ccl_local char local_use_queues_flag; + if(get_local_id(0) == 0 && get_local_id(1) == 0) { + local_use_queues_flag = use_queues_flag[0]; + } + barrier(CLK_LOCAL_MEM_FENCE); + + int ray_index; + if(local_use_queues_flag) { + int thread_index = get_global_id(1) * get_global_size(0) + get_global_id(0); + ray_index = get_ray_index(thread_index, + QUEUE_ACTIVE_AND_REGENERATED_RAYS, + Queue_data, + queuesize, + 1); + if(ray_index == QUEUE_EMPTY_SLOT) { + return; + } + } else { + if(x < (sw * parallel_samples) && y < sh){ + ray_index = x + y * (sw * parallel_samples); + } else { + return; + } + } + + kernel_lamp_emission(globals, + data, + shader_data, + throughput_coop, + PathRadiance_coop, + Ray_coop, + PathState_coop, + Intersection_coop, + ray_state, + sw, sh, + use_queues_flag, + parallel_samples, + ray_index); +} diff --git a/intern/cycles/kernel/kernels/opencl/kernel_next_iteration_setup.cl b/intern/cycles/kernel/kernels/opencl/kernel_next_iteration_setup.cl new file mode 100644 index 00000000000..dcf4db40411 --- /dev/null +++ b/intern/cycles/kernel/kernels/opencl/kernel_next_iteration_setup.cl @@ -0,0 +1,115 @@ +/* + * Copyright 2011-2015 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "split/kernel_next_iteration_setup.h" +/* OpenCL entry point: work-item (0, 0) sets use_queues_flag[0] and zeroes the two shadow-ray queue counters; each work-item fetches a ray index from QUEUE_ACTIVE_AND_REGENERATED_RAYS, runs kernel_next_iteration_setup() on it and passes the returned flag to enqueue_ray_index_local() for QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS. */ +__kernel void kernel_ocl_path_trace_next_iteration_setup( + ccl_global char *globals, + ccl_constant KernelData *data, + ccl_global char *shader_data, /* Required for setting up ray for next iteration */ + ccl_global uint *rng_coop, /* Required for setting up ray for next iteration */ + ccl_global float3 *throughput_coop, /* Required for setting up ray for next iteration */ + PathRadiance *PathRadiance_coop, /* Required for setting up ray for next iteration */ + ccl_global Ray *Ray_coop, /* Required for setting up ray for next iteration */ + ccl_global PathState *PathState_coop, /* Required for setting up ray for next iteration */ + ccl_global Ray *LightRay_dl_coop, /* Required for radiance update - direct lighting */ + ccl_global int *ISLamp_coop, /* Required for radiance update - direct lighting */ + ccl_global BsdfEval *BSDFEval_coop, /* Required for radiance update - direct lighting */ + ccl_global Ray *LightRay_ao_coop, /* Required for radiance update - AO */ + ccl_global float3 *AOBSDF_coop, /* Required for radiance update - AO */ + ccl_global float3 *AOAlpha_coop, /* Required for radiance update - AO */ + ccl_global char *ray_state, /* Denotes the state of each ray */ + ccl_global int *Queue_data, /* Queue memory */ + ccl_global int *Queue_index, /* Tracks the number of elements in each queue */ + int queuesize, /* Size (capacity) of each queue */ + ccl_global char *use_queues_flag) /* flag to decide if scene_intersect kernel should + * use queues to fetch ray index */ +{ + ccl_local unsigned int local_queue_atomics; + if(get_local_id(0) == 0 && get_local_id(1) == 0) { + local_queue_atomics = 0; + } + barrier(CLK_LOCAL_MEM_FENCE); + + if(get_global_id(0) == 0 && get_global_id(1) == 0) { + /* If we are here, then it means that scene-intersect kernel + * has already been executed at least once. 
From the next time, + * scene-intersect kernel may operate on queues to fetch ray index + */ + use_queues_flag[0] = 1; + + /* Mark queue indices of QUEUE_SHADOW_RAY_CAST_AO_RAYS and + * QUEUE_SHADOW_RAY_CAST_DL_RAYS queues that were made empty during the + * previous kernel. + */ + Queue_index[QUEUE_SHADOW_RAY_CAST_AO_RAYS] = 0; + Queue_index[QUEUE_SHADOW_RAY_CAST_DL_RAYS] = 0; + } + + char enqueue_flag = 0; + int ray_index = get_global_id(1) * get_global_size(0) + get_global_id(0); + ray_index = get_ray_index(ray_index, + QUEUE_ACTIVE_AND_REGENERATED_RAYS, + Queue_data, + queuesize, + 0); + +#ifdef __COMPUTE_DEVICE_GPU__ + /* If we are executing on a GPU device, we exit all threads that are not + * required. + * + * If we are executing on a CPU device, then we need to keep all threads + * active since we have barrier() calls later in the kernel. CPU devices + * expect all threads to execute the barrier statement. + */ + if(ray_index == QUEUE_EMPTY_SLOT) { + return; + } +#endif + +#ifndef __COMPUTE_DEVICE_GPU__ + if(ray_index != QUEUE_EMPTY_SLOT) { +#endif + enqueue_flag = kernel_next_iteration_setup(globals, + data, + shader_data, + rng_coop, + throughput_coop, + PathRadiance_coop, + Ray_coop, + PathState_coop, + LightRay_dl_coop, + ISLamp_coop, + BSDFEval_coop, + LightRay_ao_coop, + AOBSDF_coop, + AOAlpha_coop, + ray_state, + use_queues_flag, + ray_index); +#ifndef __COMPUTE_DEVICE_GPU__ + } +#endif + + /* Enqueue RAY_UPDATE_BUFFER rays. 
*/ + enqueue_ray_index_local(ray_index, + QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS, + enqueue_flag, + queuesize, + &local_queue_atomics, + Queue_data, + Queue_index); +} diff --git a/intern/cycles/kernel/kernels/opencl/kernel_queue_enqueue.cl b/intern/cycles/kernel/kernels/opencl/kernel_queue_enqueue.cl new file mode 100644 index 00000000000..3156dc255fb --- /dev/null +++ b/intern/cycles/kernel/kernels/opencl/kernel_queue_enqueue.cl @@ -0,0 +1,106 @@ +/* + * Copyright 2011-2015 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "../../kernel_compat_opencl.h" +#include "../../kernel_math.h" +#include "../../kernel_types.h" +#include "../../kernel_globals.h" +#include "../../kernel_queues.h" + +/* + * The kernel "kernel_queue_enqueue" enqueues rays of + * different ray state into their appropriate Queues; + * 1. Rays that have been determined to hit the background from the + * "kernel_scene_intersect" kernel + * are enqueued in QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS; + * 2. Rays that have been determined to be actively participating in path-iteration will be enqueued into QUEUE_ACTIVE_AND_REGENERATED_RAYS. 
+ * + * The input and output of the kernel are as follows, + * + * ray_state -------------------------------------------|--- kernel_queue_enqueue --|--- Queue_data (QUEUE_ACTIVE_AND_REGENERATED_RAYS & QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS) + * Queue_index(QUEUE_ACTIVE_AND_REGENERATED_RAYS) ------| |--- Queue_index (QUEUE_ACTIVE_AND_REGENERATED_RAYS & QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS) + * Queue_index(QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS) ---| | + * queuesize -------------------------------------------| | + * + * Note on Queues : + * State of queues during the first time this kernel is called : + * At entry, + * Both QUEUE_ACTIVE_AND_REGENERATED_RAYS and QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be empty. + * At exit, + * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE rays + * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_HIT_BACKGROUND rays. + * + * State of queue during other times this kernel is called : + * At entry, + * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be empty. + * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will contain RAY_TO_REGENERATE and RAY_UPDATE_BUFFER rays. + * At exit, + * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE rays. + * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_TO_REGENERATE, RAY_UPDATE_BUFFER, RAY_HIT_BACKGROUND rays. 
+ */ +__kernel void kernel_ocl_path_trace_queue_enqueue( + ccl_global int *Queue_data, /* Queue memory */ + ccl_global int *Queue_index, /* Tracks the number of elements in each queue */ + ccl_global char *ray_state, /* Denotes the state of each ray */ + int queuesize) /* Size (capacity) of each queue */ +{ + /* We have only 2 cases (Hit/Not-Hit) */ + ccl_local unsigned int local_queue_atomics[2]; + + int lidx = get_local_id(1) * get_local_size(0) + get_local_id(0); /* Flattened index of this work-item within its work-group. */ + int ray_index = get_global_id(1) * get_global_size(0) + get_global_id(0); /* Flattened global work-item index, used directly as the ray index. */ + + if(lidx < 2 ) { + local_queue_atomics[lidx] = 0; + } + barrier(CLK_LOCAL_MEM_FENCE); + + int queue_number = -1; /* -1: the ray is in neither state handled below and is not enqueued. */ + + if(IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND)) { + queue_number = QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS; + } + else if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) { + queue_number = QUEUE_ACTIVE_AND_REGENERATED_RAYS; + } + + unsigned int my_lqidx; + if(queue_number != -1) { + my_lqidx = get_local_queue_index(queue_number, local_queue_atomics); + } + barrier(CLK_LOCAL_MEM_FENCE); + + if(lidx == 0) { /* One work-item per work-group overwrites each local counter with the value returned by get_global_per_queue_offset(). */ + local_queue_atomics[QUEUE_ACTIVE_AND_REGENERATED_RAYS] = + get_global_per_queue_offset(QUEUE_ACTIVE_AND_REGENERATED_RAYS, + local_queue_atomics, + Queue_index); + local_queue_atomics[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS] = + get_global_per_queue_offset(QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS, + local_queue_atomics, + Queue_index); + } + barrier(CLK_LOCAL_MEM_FENCE); + + unsigned int my_gqidx; + if(queue_number != -1) { + my_gqidx = get_global_queue_index(queue_number, + queuesize, + my_lqidx, + local_queue_atomics); + Queue_data[my_gqidx] = ray_index; /* Store this ray's index in the slot returned by get_global_queue_index(). */ + } +} diff --git a/intern/cycles/kernel/kernels/opencl/kernel_scene_intersect.cl b/intern/cycles/kernel/kernels/opencl/kernel_scene_intersect.cl new file mode 100644 index 00000000000..e5fad7bce50 --- /dev/null +++ b/intern/cycles/kernel/kernels/opencl/kernel_scene_intersect.cl @@ -0,0 +1,82 @@ +/* + * Copyright 2011-2015 Blender Foundation + * + * Licensed under 
the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "split/kernel_scene_intersect.h" + +__kernel void kernel_ocl_path_trace_scene_intersect( + ccl_global char *globals, + ccl_constant KernelData *data, + ccl_global uint *rng_coop, + ccl_global Ray *Ray_coop, /* Required for scene_intersect */ + ccl_global PathState *PathState_coop, /* Required for scene_intersect */ + Intersection *Intersection_coop, /* Required for scene_intersect */ + ccl_global char *ray_state, /* Denotes the state of each ray */ + int sw, int sh, + ccl_global int *Queue_data, /* Memory for queues */ + ccl_global int *Queue_index, /* Tracks the number of elements in queues */ + int queuesize, /* Size (capacity) of queues */ + ccl_global char *use_queues_flag, /* used to decide if this kernel should use + * queues to fetch ray index */ +#ifdef __KERNEL_DEBUG__ + DebugData *debugdata_coop, +#endif + int parallel_samples) /* Number of samples to be processed in parallel */ +{ + int x = get_global_id(0); + int y = get_global_id(1); + + /* Fetch use_queues_flag once per work-group; the barrier below publishes it to all work-items. */ + ccl_local char local_use_queues_flag; + if(get_local_id(0) == 0 && get_local_id(1) == 0) { + local_use_queues_flag = use_queues_flag[0]; + } + barrier(CLK_LOCAL_MEM_FENCE); + + int ray_index; + if(local_use_queues_flag) { + int thread_index = get_global_id(1) * get_global_size(0) + get_global_id(0); + ray_index = get_ray_index(thread_index, + QUEUE_ACTIVE_AND_REGENERATED_RAYS, + Queue_data, + queuesize, + 0); + + if(ray_index == 
QUEUE_EMPTY_SLOT) { + return; + } + } else { + if(x < (sw * parallel_samples) && y < sh){ + ray_index = x + y * (sw * parallel_samples); + } else { + return; + } + } + + kernel_scene_intersect(globals, + data, + rng_coop, + Ray_coop, + PathState_coop, + Intersection_coop, + ray_state, + sw, sh, + use_queues_flag, +#ifdef __KERNEL_DEBUG__ + debugdata_coop, +#endif + parallel_samples, + ray_index); +} diff --git a/intern/cycles/kernel/kernels/opencl/kernel_shader_eval.cl b/intern/cycles/kernel/kernels/opencl/kernel_shader_eval.cl new file mode 100644 index 00000000000..b9f616e6bdf --- /dev/null +++ b/intern/cycles/kernel/kernels/opencl/kernel_shader_eval.cl @@ -0,0 +1,69 @@ +/* + * Copyright 2011-2015 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "split/kernel_shader_eval.h" + +__kernel void kernel_ocl_path_trace_shader_eval( + ccl_global char *globals, + ccl_constant KernelData *data, + ccl_global char *shader_data, /* Output ShaderData structure to be filled */ + ccl_global uint *rng_coop, /* Required for rbsdf calculation */ + ccl_global Ray *Ray_coop, /* Required for setting up shader from ray */ + ccl_global PathState *PathState_coop, /* Required for all functions in this kernel */ + Intersection *Intersection_coop, /* Required for setting up shader from ray */ + ccl_global char *ray_state, /* Denotes the state of each ray */ + ccl_global int *Queue_data, /* queue memory */ + ccl_global int *Queue_index, /* Tracks the number of elements in each queue */ + int queuesize) /* Size (capacity) of each queue */ +{ + /* Enqueue RAY_TO_REGENERATE rays into QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue. */ + ccl_local unsigned int local_queue_atomics; + if(get_local_id(0) == 0 && get_local_id(1) == 0) { + local_queue_atomics = 0; + } + barrier(CLK_LOCAL_MEM_FENCE); + + int ray_index = get_global_id(1) * get_global_size(0) + get_global_id(0); + ray_index = get_ray_index(ray_index, + QUEUE_ACTIVE_AND_REGENERATED_RAYS, + Queue_data, + queuesize, + 0); + + if(ray_index == QUEUE_EMPTY_SLOT) { + return; + } + + char enqueue_flag = (IS_STATE(ray_state, ray_index, RAY_TO_REGENERATE)) ? 1 : 0; + enqueue_ray_index_local(ray_index, + QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS, + enqueue_flag, + queuesize, + &local_queue_atomics, + Queue_data, + Queue_index); + + /* Continue on with shader evaluation. 
*/ + kernel_shader_eval(globals, + data, + shader_data, + rng_coop, + Ray_coop, + PathState_coop, + Intersection_coop, + ray_state, + ray_index); +} diff --git a/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked.cl b/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked.cl new file mode 100644 index 00000000000..03886c0a030 --- /dev/null +++ b/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked.cl @@ -0,0 +1,84 @@ +/* + * Copyright 2011-2015 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "split/kernel_shadow_blocked.h" + +__kernel void kernel_ocl_path_trace_shadow_blocked( + ccl_global char *globals, + ccl_constant KernelData *data, + ccl_global char *shader_shadow, /* Required for shadow blocked */ + ccl_global PathState *PathState_coop, /* Required for shadow blocked */ + ccl_global Ray *LightRay_dl_coop, /* Required for direct lighting's shadow blocked */ + ccl_global Ray *LightRay_ao_coop, /* Required for AO's shadow blocked */ + Intersection *Intersection_coop_AO, + Intersection *Intersection_coop_DL, + ccl_global char *ray_state, + ccl_global int *Queue_data, /* Queue memory */ + ccl_global int *Queue_index, /* Tracks the number of elements in each queue */ + int queuesize, /* Size (capacity) of each queue */ + int total_num_rays) +{ +#if 0 + /* We will make the Queue_index entries '0' in the next kernel. 
*/ + if(get_global_id(0) == 0 && get_global_id(1) == 0) { + /* We empty this queue here */ + Queue_index[QUEUE_SHADOW_RAY_CAST_AO_RAYS] = 0; + Queue_index[QUEUE_SHADOW_RAY_CAST_DL_RAYS] = 0; + } +#endif + + /* Flattened index of this work-item within its work-group; the row stride is get_local_size(0). */ + int lidx = get_local_id(1) * get_local_size(0) + get_local_id(0); + + ccl_local unsigned int ao_queue_length; + ccl_local unsigned int dl_queue_length; + if(lidx == 0) { + ao_queue_length = Queue_index[QUEUE_SHADOW_RAY_CAST_AO_RAYS]; + dl_queue_length = Queue_index[QUEUE_SHADOW_RAY_CAST_DL_RAYS]; + } + barrier(CLK_LOCAL_MEM_FENCE); + + /* Flag selecting which shadow ray this work-item processes: AO or direct lighting (DL). */ + char shadow_blocked_type = -1; + + int ray_index = QUEUE_EMPTY_SLOT; + int thread_index = get_global_id(1) * get_global_size(0) + get_global_id(0); + if(thread_index < ao_queue_length + dl_queue_length) { + if(thread_index < ao_queue_length) { + ray_index = get_ray_index(thread_index, QUEUE_SHADOW_RAY_CAST_AO_RAYS, Queue_data, queuesize, 1); + shadow_blocked_type = RAY_SHADOW_RAY_CAST_AO; + } else { + ray_index = get_ray_index(thread_index - ao_queue_length, QUEUE_SHADOW_RAY_CAST_DL_RAYS, Queue_data, queuesize, 1); + shadow_blocked_type = RAY_SHADOW_RAY_CAST_DL; + } + } + + if(ray_index == QUEUE_EMPTY_SLOT) + return; + + kernel_shadow_blocked(globals, + data, + shader_shadow, + PathState_coop, + LightRay_dl_coop, + LightRay_ao_coop, + Intersection_coop_AO, + Intersection_coop_DL, + ray_state, + total_num_rays, + shadow_blocked_type, + ray_index); +} diff --git a/intern/cycles/kernel/kernels/opencl/kernel_sum_all_radiance.cl b/intern/cycles/kernel/kernels/opencl/kernel_sum_all_radiance.cl new file mode 100644 index 00000000000..88a1ed830af --- /dev/null +++ b/intern/cycles/kernel/kernels/opencl/kernel_sum_all_radiance.cl @@ -0,0 +1,38 @@ +/* + * Copyright 2011-2015 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "split/kernel_sum_all_radiance.h" + +__kernel void kernel_ocl_path_trace_sum_all_radiance( + ccl_constant KernelData *data, /* To get pass_stride to offset into buffer */ + ccl_global float *buffer, /* Output buffer of RenderTile */ + ccl_global float *per_sample_output_buffer, /* Radiance contributed by all samples */ + int parallel_samples, int sw, int sh, int stride, + int buffer_offset_x, + int buffer_offset_y, + int buffer_stride, + int start_sample) +{ + kernel_sum_all_radiance(data, + buffer, + per_sample_output_buffer, + parallel_samples, + sw, sh, stride, + buffer_offset_x, + buffer_offset_y, + buffer_stride, + start_sample); +} diff --git a/intern/cycles/kernel/osl/SConscript b/intern/cycles/kernel/osl/SConscript index 58b0204a1b9..74ba5e1020c 100644 --- a/intern/cycles/kernel/osl/SConscript +++ b/intern/cycles/kernel/osl/SConscript @@ -44,6 +44,18 @@ defs.append('CCL_NAMESPACE_BEGIN=namespace ccl {') defs.append('CCL_NAMESPACE_END=}') defs.append('WITH_OSL') +if env['WITH_UNORDERED_MAP_SUPPORT']: + if env['UNORDERED_MAP_HEADER'] == 'unordered_map': + if env['UNORDERED_MAP_NAMESPACE'] == 'std': + defs.append('CYCLES_STD_UNORDERED_MAP') + elif env['UNORDERED_MAP_NAMESPACE'] == 'std::tr1': + defs.append('CYCLES_STD_UNORDERED_MAP_IN_TR1_NAMESPACE') + elif env['UNORDERED_MAP_NAMESPACE'] == 'std::tr1': + defs.append('CYCLES_TR1_UNORDERED_MAP') +else: + print("-- Replacing unordered_map/set with map/set (warning: slower!)") + defs.append('CYCLES_NO_UNORDERED_MAP') + if env['WITH_BF_CYCLES_DEBUG']: 
defs.append('WITH_CYCLES_DEBUG') diff --git a/intern/cycles/kernel/osl/bsdf_diffuse_ramp.cpp b/intern/cycles/kernel/osl/bsdf_diffuse_ramp.cpp index 8f9c2efd470..43929fbe928 100644 --- a/intern/cycles/kernel/osl/bsdf_diffuse_ramp.cpp +++ b/intern/cycles/kernel/osl/bsdf_diffuse_ramp.cpp @@ -34,6 +34,7 @@ #include <OSL/genclosure.h> +#include "kernel_compat_cpu.h" #include "osl_closures.h" #include "kernel_types.h" diff --git a/intern/cycles/kernel/osl/bsdf_phong_ramp.cpp b/intern/cycles/kernel/osl/bsdf_phong_ramp.cpp index c5851747b54..497c4f0dc5c 100644 --- a/intern/cycles/kernel/osl/bsdf_phong_ramp.cpp +++ b/intern/cycles/kernel/osl/bsdf_phong_ramp.cpp @@ -34,6 +34,7 @@ #include <OSL/genclosure.h> +#include "kernel_compat_cpu.h" #include "osl_closures.h" #include "kernel_types.h" diff --git a/intern/cycles/kernel/osl/osl_bssrdf.cpp b/intern/cycles/kernel/osl/osl_bssrdf.cpp index 84ef85e089d..88998037751 100644 --- a/intern/cycles/kernel/osl/osl_bssrdf.cpp +++ b/intern/cycles/kernel/osl/osl_bssrdf.cpp @@ -34,6 +34,7 @@ #include <OSL/genclosure.h> +#include "kernel_compat_cpu.h" #include "osl_bssrdf.h" #include "osl_closures.h" diff --git a/intern/cycles/kernel/osl/osl_closures.h b/intern/cycles/kernel/osl/osl_closures.h index 5e833d738d8..ef67ef52fc0 100644 --- a/intern/cycles/kernel/osl/osl_closures.h +++ b/intern/cycles/kernel/osl/osl_closures.h @@ -147,14 +147,14 @@ public: \ \ float3 eval_reflect(const float3 &omega_out, const float3 &omega_in, float& pdf) const \ { \ - pdf = 0; \ - return make_float3(0, 0, 0); \ + pdf = 0.0f; \ + return make_float3(0.0f, 0.0f, 0.0f); \ } \ \ float3 eval_transmit(const float3 &omega_out, const float3 &omega_in, float& pdf) const \ { \ - pdf = 0; \ - return make_float3(0, 0, 0); \ + pdf = 0.0f; \ + return make_float3(0.0f, 0.0f, 0.0f); \ } \ \ int sample(const float3 &Ng, \ diff --git a/intern/cycles/kernel/osl/osl_services.cpp b/intern/cycles/kernel/osl/osl_services.cpp index 1f6015d0d6b..3c1955a1e1e 100644 --- 
a/intern/cycles/kernel/osl/osl_services.cpp +++ b/intern/cycles/kernel/osl/osl_services.cpp @@ -20,6 +20,7 @@ */ #if defined(__GNUC__) && defined(NDEBUG) # pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +# pragma GCC diagnostic ignored "-Wuninitialized" #endif #include <string.h> @@ -138,12 +139,12 @@ bool OSLRenderServices::get_matrix(OSL::ShaderGlobals *sg, OSL::Matrix44 &result { /* this is only used for shader and object space, we don't really have * a concept of shader space, so we just use object space for both. */ - if (xform) { + if(xform) { const ShaderData *sd = (const ShaderData *)xform; KernelGlobals *kg = sd->osl_globals; int object = sd->object; - if (object != OBJECT_NONE) { + if(object != OBJECT_NONE) { #ifdef __OBJECT_MOTION__ Transform tfm; @@ -168,12 +169,12 @@ bool OSLRenderServices::get_inverse_matrix(OSL::ShaderGlobals *sg, OSL::Matrix44 { /* this is only used for shader and object space, we don't really have * a concept of shader space, so we just use object space for both. 
*/ - if (xform) { + if(xform) { const ShaderData *sd = (const ShaderData *)xform; KernelGlobals *kg = sd->osl_globals; int object = sd->object; - if (object != OBJECT_NONE) { + if(object != OBJECT_NONE) { #ifdef __OBJECT_MOTION__ Transform itfm; @@ -198,27 +199,27 @@ bool OSLRenderServices::get_matrix(OSL::ShaderGlobals *sg, OSL::Matrix44 &result { KernelGlobals *kg = kernel_globals; - if (from == u_ndc) { + if(from == u_ndc) { Transform tfm = transform_transpose(transform_quick_inverse(kernel_data.cam.worldtondc)); COPY_MATRIX44(&result, &tfm); return true; } - else if (from == u_raster) { + else if(from == u_raster) { Transform tfm = transform_transpose(kernel_data.cam.rastertoworld); COPY_MATRIX44(&result, &tfm); return true; } - else if (from == u_screen) { + else if(from == u_screen) { Transform tfm = transform_transpose(kernel_data.cam.screentoworld); COPY_MATRIX44(&result, &tfm); return true; } - else if (from == u_camera) { + else if(from == u_camera) { Transform tfm = transform_transpose(kernel_data.cam.cameratoworld); COPY_MATRIX44(&result, &tfm); return true; } - else if (from == u_world) { + else if(from == u_world) { result.makeIdentity(); return true; } @@ -230,27 +231,27 @@ bool OSLRenderServices::get_inverse_matrix(OSL::ShaderGlobals *sg, OSL::Matrix44 { KernelGlobals *kg = kernel_globals; - if (to == u_ndc) { + if(to == u_ndc) { Transform tfm = transform_transpose(kernel_data.cam.worldtondc); COPY_MATRIX44(&result, &tfm); return true; } - else if (to == u_raster) { + else if(to == u_raster) { Transform tfm = transform_transpose(kernel_data.cam.worldtoraster); COPY_MATRIX44(&result, &tfm); return true; } - else if (to == u_screen) { + else if(to == u_screen) { Transform tfm = transform_transpose(kernel_data.cam.worldtoscreen); COPY_MATRIX44(&result, &tfm); return true; } - else if (to == u_camera) { + else if(to == u_camera) { Transform tfm = transform_transpose(kernel_data.cam.worldtocamera); COPY_MATRIX44(&result, &tfm); return true; } - else if 
(to == u_world) { + else if(to == u_world) { result.makeIdentity(); return true; } @@ -262,11 +263,11 @@ bool OSLRenderServices::get_matrix(OSL::ShaderGlobals *sg, OSL::Matrix44 &result { /* this is only used for shader and object space, we don't really have * a concept of shader space, so we just use object space for both. */ - if (xform) { + if(xform) { const ShaderData *sd = (const ShaderData *)xform; int object = sd->object; - if (object != OBJECT_NONE) { + if(object != OBJECT_NONE) { #ifdef __OBJECT_MOTION__ Transform tfm = sd->ob_tfm; #else @@ -287,11 +288,11 @@ bool OSLRenderServices::get_inverse_matrix(OSL::ShaderGlobals *sg, OSL::Matrix44 { /* this is only used for shader and object space, we don't really have * a concept of shader space, so we just use object space for both. */ - if (xform) { + if(xform) { const ShaderData *sd = (const ShaderData *)xform; int object = sd->object; - if (object != OBJECT_NONE) { + if(object != OBJECT_NONE) { #ifdef __OBJECT_MOTION__ Transform tfm = sd->ob_itfm; #else @@ -312,22 +313,22 @@ bool OSLRenderServices::get_matrix(OSL::ShaderGlobals *sg, OSL::Matrix44 &result { KernelGlobals *kg = kernel_globals; - if (from == u_ndc) { + if(from == u_ndc) { Transform tfm = transform_transpose(transform_quick_inverse(kernel_data.cam.worldtondc)); COPY_MATRIX44(&result, &tfm); return true; } - else if (from == u_raster) { + else if(from == u_raster) { Transform tfm = transform_transpose(kernel_data.cam.rastertoworld); COPY_MATRIX44(&result, &tfm); return true; } - else if (from == u_screen) { + else if(from == u_screen) { Transform tfm = transform_transpose(kernel_data.cam.screentoworld); COPY_MATRIX44(&result, &tfm); return true; } - else if (from == u_camera) { + else if(from == u_camera) { Transform tfm = transform_transpose(kernel_data.cam.cameratoworld); COPY_MATRIX44(&result, &tfm); return true; @@ -340,22 +341,22 @@ bool OSLRenderServices::get_inverse_matrix(OSL::ShaderGlobals *sg, OSL::Matrix44 { KernelGlobals *kg = 
kernel_globals; - if (to == u_ndc) { + if(to == u_ndc) { Transform tfm = transform_transpose(kernel_data.cam.worldtondc); COPY_MATRIX44(&result, &tfm); return true; } - else if (to == u_raster) { + else if(to == u_raster) { Transform tfm = transform_transpose(kernel_data.cam.worldtoraster); COPY_MATRIX44(&result, &tfm); return true; } - else if (to == u_screen) { + else if(to == u_screen) { Transform tfm = transform_transpose(kernel_data.cam.worldtoscreen); COPY_MATRIX44(&result, &tfm); return true; } - else if (to == u_camera) { + else if(to == u_camera) { Transform tfm = transform_transpose(kernel_data.cam.worldtocamera); COPY_MATRIX44(&result, &tfm); return true; @@ -373,8 +374,8 @@ bool OSLRenderServices::get_array_attribute(OSL::ShaderGlobals *sg, bool derivat static bool set_attribute_float3(float3 f[3], TypeDesc type, bool derivatives, void *val) { - if (type == TypeDesc::TypePoint || type == TypeDesc::TypeVector || - type == TypeDesc::TypeNormal || type == TypeDesc::TypeColor) + if(type == TypeDesc::TypePoint || type == TypeDesc::TypeVector || + type == TypeDesc::TypeNormal || type == TypeDesc::TypeColor) { float *fval = (float *)val; @@ -382,7 +383,7 @@ static bool set_attribute_float3(float3 f[3], TypeDesc type, bool derivatives, v fval[1] = f[0].y; fval[2] = f[0].z; - if (derivatives) { + if(derivatives) { fval[3] = f[1].x; fval[4] = f[1].y; fval[5] = f[1].z; @@ -398,7 +399,7 @@ static bool set_attribute_float3(float3 f[3], TypeDesc type, bool derivatives, v float *fval = (float *)val; fval[0] = average(f[0]); - if (derivatives) { + if(derivatives) { fval[1] = average(f[1]); fval[2] = average(f[2]); } @@ -422,15 +423,15 @@ static bool set_attribute_float3(float3 f, TypeDesc type, bool derivatives, void static bool set_attribute_float(float f[3], TypeDesc type, bool derivatives, void *val) { - if (type == TypeDesc::TypePoint || type == TypeDesc::TypeVector || - type == TypeDesc::TypeNormal || type == TypeDesc::TypeColor) + if(type == TypeDesc::TypePoint 
|| type == TypeDesc::TypeVector || + type == TypeDesc::TypeNormal || type == TypeDesc::TypeColor) { float *fval = (float *)val; fval[0] = f[0]; fval[1] = f[1]; fval[2] = f[2]; - if (derivatives) { + if(derivatives) { fval[3] = f[1]; fval[4] = f[1]; fval[5] = f[1]; @@ -446,7 +447,7 @@ static bool set_attribute_float(float f[3], TypeDesc type, bool derivatives, voi float *fval = (float *)val; fval[0] = f[0]; - if (derivatives) { + if(derivatives) { fval[1] = f[1]; fval[2] = f[2]; } @@ -474,7 +475,7 @@ static bool set_attribute_int(int i, TypeDesc type, bool derivatives, void *val) int *ival = (int *)val; ival[0] = i; - if (derivatives) { + if(derivatives) { ival[1] = 0; ival[2] = 0; } @@ -491,7 +492,7 @@ static bool set_attribute_string(ustring str, TypeDesc type, bool derivatives, v ustring *sval = (ustring *)val; sval[0] = str; - if (derivatives) { + if(derivatives) { sval[1] = OSLRenderServices::u_empty; sval[2] = OSLRenderServices::u_empty; } @@ -521,7 +522,7 @@ static bool set_attribute_float3_3(float3 P[3], TypeDesc type, bool derivatives, if(type.arraylen > 3) memset(fval + 3*3, 0, sizeof(float)*3*(type.arraylen - 3)); - if (derivatives) + if(derivatives) memset(fval + type.arraylen*3, 0, sizeof(float)*2*3*type.arraylen); return true; @@ -544,15 +545,15 @@ static bool set_attribute_matrix(const Transform& tfm, TypeDesc type, void *val) static bool get_mesh_element_attribute(KernelGlobals *kg, const ShaderData *sd, const OSLGlobals::Attribute& attr, const TypeDesc& type, bool derivatives, void *val) { - if (attr.type == TypeDesc::TypePoint || attr.type == TypeDesc::TypeVector || - attr.type == TypeDesc::TypeNormal || attr.type == TypeDesc::TypeColor) + if(attr.type == TypeDesc::TypePoint || attr.type == TypeDesc::TypeVector || + attr.type == TypeDesc::TypeNormal || attr.type == TypeDesc::TypeColor) { float3 fval[3]; fval[0] = primitive_attribute_float3(kg, sd, attr.elem, attr.offset, (derivatives) ? &fval[1] : NULL, (derivatives) ? 
&fval[2] : NULL); return set_attribute_float3(fval, type, derivatives, val); } - else if (attr.type == TypeDesc::TypeFloat) { + else if(attr.type == TypeDesc::TypeFloat) { float fval[3]; fval[0] = primitive_attribute_float(kg, sd, attr.elem, attr.offset, (derivatives) ? &fval[1] : NULL, (derivatives) ? &fval[2] : NULL); @@ -566,7 +567,7 @@ static bool get_mesh_element_attribute(KernelGlobals *kg, const ShaderData *sd, static bool get_mesh_attribute(KernelGlobals *kg, const ShaderData *sd, const OSLGlobals::Attribute& attr, const TypeDesc& type, bool derivatives, void *val) { - if (attr.type == TypeDesc::TypeMatrix) { + if(attr.type == TypeDesc::TypeMatrix) { Transform tfm = primitive_attribute_matrix(kg, sd, attr.offset); return set_attribute_matrix(tfm, type, val); } @@ -580,7 +581,7 @@ static void get_object_attribute(const OSLGlobals::Attribute& attr, bool derivat size_t datasize = attr.value.datasize(); memcpy(val, attr.value.data(), datasize); - if (derivatives) + if(derivatives) memset((char *)val + datasize, 0, datasize * 2); } @@ -590,80 +591,80 @@ bool OSLRenderServices::get_object_standard_attribute(KernelGlobals *kg, ShaderD /* todo: turn this into hash table? 
*/ /* Object Attributes */ - if (name == u_object_location) { + if(name == u_object_location) { float3 f = object_location(kg, sd); return set_attribute_float3(f, type, derivatives, val); } - else if (name == u_object_index) { + else if(name == u_object_index) { float f = object_pass_id(kg, sd->object); return set_attribute_float(f, type, derivatives, val); } - else if (name == u_geom_dupli_generated) { + else if(name == u_geom_dupli_generated) { float3 f = object_dupli_generated(kg, sd->object); return set_attribute_float3(f, type, derivatives, val); } - else if (name == u_geom_dupli_uv) { + else if(name == u_geom_dupli_uv) { float3 f = object_dupli_uv(kg, sd->object); return set_attribute_float3(f, type, derivatives, val); } - else if (name == u_material_index) { + else if(name == u_material_index) { float f = shader_pass_id(kg, sd); return set_attribute_float(f, type, derivatives, val); } - else if (name == u_object_random) { + else if(name == u_object_random) { float f = object_random_number(kg, sd->object); return set_attribute_float(f, type, derivatives, val); } /* Particle Attributes */ - else if (name == u_particle_index) { + else if(name == u_particle_index) { int particle_id = object_particle_id(kg, sd->object); float f = particle_index(kg, particle_id); return set_attribute_float(f, type, derivatives, val); } - else if (name == u_particle_age) { + else if(name == u_particle_age) { int particle_id = object_particle_id(kg, sd->object); float f = particle_age(kg, particle_id); return set_attribute_float(f, type, derivatives, val); } - else if (name == u_particle_lifetime) { + else if(name == u_particle_lifetime) { int particle_id = object_particle_id(kg, sd->object); float f = particle_lifetime(kg, particle_id); return set_attribute_float(f, type, derivatives, val); } - else if (name == u_particle_location) { + else if(name == u_particle_location) { int particle_id = object_particle_id(kg, sd->object); float3 f = particle_location(kg, particle_id); return 
set_attribute_float3(f, type, derivatives, val); } #if 0 /* unsupported */ - else if (name == u_particle_rotation) { + else if(name == u_particle_rotation) { int particle_id = object_particle_id(kg, sd->object); float4 f = particle_rotation(kg, particle_id); return set_attribute_float4(f, type, derivatives, val); } #endif - else if (name == u_particle_size) { + else if(name == u_particle_size) { int particle_id = object_particle_id(kg, sd->object); float f = particle_size(kg, particle_id); return set_attribute_float(f, type, derivatives, val); } - else if (name == u_particle_velocity) { + else if(name == u_particle_velocity) { int particle_id = object_particle_id(kg, sd->object); float3 f = particle_velocity(kg, particle_id); return set_attribute_float3(f, type, derivatives, val); } - else if (name == u_particle_angular_velocity) { + else if(name == u_particle_angular_velocity) { int particle_id = object_particle_id(kg, sd->object); float3 f = particle_angular_velocity(kg, particle_id); return set_attribute_float3(f, type, derivatives, val); } /* Geometry Attributes */ - else if (name == u_geom_numpolyvertices) { + else if(name == u_geom_numpolyvertices) { return set_attribute_int(3, type, derivatives, val); } - else if ((name == u_geom_trianglevertices || name == u_geom_polyvertices) + else if((name == u_geom_trianglevertices || name == u_geom_polyvertices) #ifdef __HAIR__ && sd->type & PRIMITIVE_ALL_TRIANGLE) #else @@ -689,21 +690,21 @@ bool OSLRenderServices::get_object_standard_attribute(KernelGlobals *kg, ShaderD ustring object_name = kg->osl->object_names[sd->object]; return set_attribute_string(object_name, type, derivatives, val); } - else if (name == u_is_smooth) { + else if(name == u_is_smooth) { float f = ((sd->shader & SHADER_SMOOTH_NORMAL) != 0); return set_attribute_float(f, type, derivatives, val); } #ifdef __HAIR__ /* Hair Attributes */ - else if (name == u_is_curve) { + else if(name == u_is_curve) { float f = (sd->type & PRIMITIVE_ALL_CURVE) != 0; 
return set_attribute_float(f, type, derivatives, val); } - else if (name == u_curve_thickness) { + else if(name == u_curve_thickness) { float f = curve_thickness(kg, sd); return set_attribute_float(f, type, derivatives, val); } - else if (name == u_curve_tangent_normal) { + else if(name == u_curve_tangent_normal) { float3 f = curve_tangent_normal(kg, sd); return set_attribute_float3(f, type, derivatives, val); } @@ -715,22 +716,22 @@ bool OSLRenderServices::get_object_standard_attribute(KernelGlobals *kg, ShaderD bool OSLRenderServices::get_background_attribute(KernelGlobals *kg, ShaderData *sd, ustring name, TypeDesc type, bool derivatives, void *val) { - if (name == u_path_ray_length) { + if(name == u_path_ray_length) { /* Ray Length */ float f = sd->ray_length; return set_attribute_float(f, type, derivatives, val); } - else if (name == u_path_ray_depth) { + else if(name == u_path_ray_depth) { /* Ray Depth */ int f = sd->ray_depth; return set_attribute_int(f, type, derivatives, val); } - else if (name == u_path_transparent_depth) { + else if(name == u_path_transparent_depth) { /* Transparent Ray Depth */ int f = sd->transparent_depth; return set_attribute_int(f, type, derivatives, val); } - else if (name == u_ndc) { + else if(name == u_ndc) { /* NDC coordinates with special exception for otho */ OSLThreadData *tdata = kg->osl_tdata; OSL::ShaderGlobals *globals = &tdata->globals; @@ -762,7 +763,7 @@ bool OSLRenderServices::get_background_attribute(KernelGlobals *kg, ShaderData * bool OSLRenderServices::get_attribute(OSL::ShaderGlobals *sg, bool derivatives, ustring object_name, TypeDesc type, ustring name, void *val) { - if (sg->renderstate == NULL) + if(sg->renderstate == NULL) return false; ShaderData *sd = (ShaderData *)(sg->renderstate); @@ -777,10 +778,10 @@ bool OSLRenderServices::get_attribute(ShaderData *sd, bool derivatives, ustring int object; /* lookup of attribute on another object */ - if (object_name != u_empty) { + if(object_name != u_empty) { 
OSLGlobals::ObjectNameMap::iterator it = kg->osl->object_name_map.find(object_name); - if (it == kg->osl->object_name_map.end()) + if(it == kg->osl->object_name_map.end()) return false; object = it->second; @@ -790,7 +791,7 @@ bool OSLRenderServices::get_attribute(ShaderData *sd, bool derivatives, ustring object = sd->object; is_curve = (sd->type & PRIMITIVE_ALL_CURVE) != 0; - if (object == OBJECT_NONE) + if(object == OBJECT_NONE) return get_background_attribute(kg, sd, name, type, derivatives, val); } @@ -799,10 +800,10 @@ bool OSLRenderServices::get_attribute(ShaderData *sd, bool derivatives, ustring OSLGlobals::AttributeMap& attribute_map = kg->osl->attribute_map[object]; OSLGlobals::AttributeMap::iterator it = attribute_map.find(name); - if (it != attribute_map.end()) { + if(it != attribute_map.end()) { const OSLGlobals::Attribute& attr = it->second; - if (attr.elem != ATTR_ELEMENT_OBJECT) { + if(attr.elem != ATTR_ELEMENT_OBJECT) { /* triangle and vertex attributes */ if(get_mesh_element_attribute(kg, sd, attr, type, derivatives, val)) return true; @@ -819,7 +820,7 @@ bool OSLRenderServices::get_attribute(ShaderData *sd, bool derivatives, ustring /* not found in attribute, check standard object info */ bool is_std_object_attribute = get_object_standard_attribute(kg, sd, name, type, derivatives, val); - if (is_std_object_attribute) + if(is_std_object_attribute) return true; return get_background_attribute(kg, sd, name, type, derivatives, val); @@ -887,7 +888,7 @@ bool OSLRenderServices::texture(ustring filename, TextureOpt &options, #endif bool status; - if(filename[0] == '@' && filename.find('.') == -1) { + if(filename[0] == '@') { int slot = atoi(filename.c_str() + 1); float4 rgba = kernel_tex_image_interp(slot, s, 1.0f - t); @@ -939,19 +940,33 @@ bool OSLRenderServices::texture3d(ustring filename, TextureOpt &options, OSL::TextureSystem *ts = osl_ts; ShaderData *sd = (ShaderData *)(sg->renderstate); KernelGlobals *kg = sd->osl_globals; - OSLThreadData *tdata 
= kg->osl_tdata; - OIIO::TextureSystem::Perthread *thread_info = tdata->oiio_thread_info; - - OIIO::TextureSystem::TextureHandle *th = ts->get_texture_handle(filename, thread_info); + bool status; + if(filename[0] == '@') { + int slot = atoi(filename.c_str() + 1); + float4 rgba = kernel_tex_image_interp_3d(slot, P.x, P.y, P.z); + result[0] = rgba[0]; + if(nchannels > 1) + result[1] = rgba[1]; + if(nchannels > 2) + result[2] = rgba[2]; + if(nchannels > 3) + result[3] = rgba[3]; + status = true; + } + else { + OSLThreadData *tdata = kg->osl_tdata; + OIIO::TextureSystem::Perthread *thread_info = tdata->oiio_thread_info; + OIIO::TextureSystem::TextureHandle *th = ts->get_texture_handle(filename, thread_info); #if OIIO_VERSION < 10500 - bool status = ts->texture3d(th, thread_info, - options, P, dPdx, dPdy, dPdz, result); + status = ts->texture3d(th, thread_info, + options, P, dPdx, dPdy, dPdz, result); #else - bool status = ts->texture3d(th, thread_info, - options, P, dPdx, dPdy, dPdz, - nchannels, result); + status = ts->texture3d(th, thread_info, + options, P, dPdx, dPdy, dPdz, + nchannels, result); #endif + } if(!status) { if(nchannels == 3 || nchannels == 4) { @@ -979,7 +994,7 @@ bool OSLRenderServices::environment(ustring filename, TextureOpt &options, OSLThreadData *tdata = kg->osl_tdata; OIIO::TextureSystem::Perthread *thread_info = tdata->oiio_thread_info; - OIIO::TextureSystem::TextureHandle *th = ts->get_texture_handle(filename, thread_info); + OIIO::TextureSystem::TextureHandle *th = ts->get_texture_handle(filename, thread_info); #if OIIO_VERSION < 10500 bool status = ts->environment(th, thread_info, diff --git a/intern/cycles/kernel/osl/osl_shader.cpp b/intern/cycles/kernel/osl/osl_shader.cpp index ebf72ae11f4..8cfe0cbcbd4 100644 --- a/intern/cycles/kernel/osl/osl_shader.cpp +++ b/intern/cycles/kernel/osl/osl_shader.cpp @@ -146,11 +146,11 @@ static void flatten_surface_closure_tree(ShaderData *sd, int path_flag, /* OSL gives us a closure tree, we flatten it 
into arrays per * closure type, for evaluation, sampling, etc later on. */ - if (closure->type == OSL::ClosureColor::COMPONENT) { + if(closure->type == OSL::ClosureColor::COMPONENT) { OSL::ClosureComponent *comp = (OSL::ClosureComponent *)closure; CClosurePrimitive *prim = (CClosurePrimitive *)comp->data(); - if (prim) { + if(prim) { ShaderClosure sc; #ifdef OSL_SUPPORTS_WEIGHTED_CLOSURE_COMPONENTS @@ -267,6 +267,7 @@ static void flatten_surface_closure_tree(ShaderData *sd, int path_flag, if(fabsf(weight.x) > 0.0f) { sc.weight = make_float3(weight.x, 0.0f, 0.0f); sc.data0 = bssrdf->radius.x; + sc.data1 = 0.0f; sd->flag |= bssrdf_setup(&sc, sc.type); sd->closure[sd->num_closure++] = sc; } @@ -274,6 +275,7 @@ static void flatten_surface_closure_tree(ShaderData *sd, int path_flag, if(fabsf(weight.y) > 0.0f) { sc.weight = make_float3(0.0f, weight.y, 0.0f); sc.data0 = bssrdf->radius.y; + sc.data1 = 0.0f; sd->flag |= bssrdf_setup(&sc, sc.type); sd->closure[sd->num_closure++] = sc; } @@ -281,6 +283,7 @@ static void flatten_surface_closure_tree(ShaderData *sd, int path_flag, if(fabsf(weight.z) > 0.0f) { sc.weight = make_float3(0.0f, 0.0f, weight.z); sc.data0 = bssrdf->radius.z; + sc.data1 = 0.0f; sd->flag |= bssrdf_setup(&sc, sc.type); sd->closure[sd->num_closure++] = sc; } @@ -293,11 +296,11 @@ static void flatten_surface_closure_tree(ShaderData *sd, int path_flag, } } } - else if (closure->type == OSL::ClosureColor::MUL) { + else if(closure->type == OSL::ClosureColor::MUL) { OSL::ClosureMul *mul = (OSL::ClosureMul *)closure; flatten_surface_closure_tree(sd, path_flag, mul->closure, TO_FLOAT3(mul->weight) * weight); } - else if (closure->type == OSL::ClosureColor::ADD) { + else if(closure->type == OSL::ClosureColor::ADD) { OSL::ClosureAdd *add = (OSL::ClosureAdd *)closure; flatten_surface_closure_tree(sd, path_flag, add->closureA, weight); flatten_surface_closure_tree(sd, path_flag, add->closureB, weight); @@ -316,11 +319,11 @@ void OSLShader::eval_surface(KernelGlobals 
*kg, ShaderData *sd, int path_flag, S OSL::ShadingContext *octx = tdata->context[(int)ctx]; int shader = sd->shader & SHADER_MASK; - if (kg->osl->surface_state[shader]) + if(kg->osl->surface_state[shader]) ss->execute(*octx, *(kg->osl->surface_state[shader]), *globals); /* flatten closure tree */ - if (globals->Ci) + if(globals->Ci) flatten_surface_closure_tree(sd, path_flag, globals->Ci); } @@ -332,23 +335,23 @@ static float3 flatten_background_closure_tree(const OSL::ClosureColor *closure) * is only one supported closure type at the moment, which has no evaluation * functions, so we just sum the weights */ - if (closure->type == OSL::ClosureColor::COMPONENT) { + if(closure->type == OSL::ClosureColor::COMPONENT) { OSL::ClosureComponent *comp = (OSL::ClosureComponent *)closure; CClosurePrimitive *prim = (CClosurePrimitive *)comp->data(); - if (prim && prim->category == CClosurePrimitive::Background) + if(prim && prim->category == CClosurePrimitive::Background) #ifdef OSL_SUPPORTS_WEIGHTED_CLOSURE_COMPONENTS return TO_FLOAT3(comp->w); #else return make_float3(1.0f, 1.0f, 1.0f); #endif } - else if (closure->type == OSL::ClosureColor::MUL) { + else if(closure->type == OSL::ClosureColor::MUL) { OSL::ClosureMul *mul = (OSL::ClosureMul *)closure; return TO_FLOAT3(mul->weight) * flatten_background_closure_tree(mul->closure); } - else if (closure->type == OSL::ClosureColor::ADD) { + else if(closure->type == OSL::ClosureColor::ADD) { OSL::ClosureAdd *add = (OSL::ClosureAdd *)closure; return flatten_background_closure_tree(add->closureA) + @@ -369,11 +372,11 @@ float3 OSLShader::eval_background(KernelGlobals *kg, ShaderData *sd, int path_fl OSL::ShaderGlobals *globals = &tdata->globals; OSL::ShadingContext *octx = tdata->context[(int)ctx]; - if (kg->osl->background_state) + if(kg->osl->background_state) ss->execute(*octx, *(kg->osl->background_state), *globals); /* return background color immediately */ - if (globals->Ci) + if(globals->Ci) return 
flatten_background_closure_tree(globals->Ci); return make_float3(0.0f, 0.0f, 0.0f); @@ -387,11 +390,11 @@ static void flatten_volume_closure_tree(ShaderData *sd, /* OSL gives us a closure tree, we flatten it into arrays per * closure type, for evaluation, sampling, etc later on. */ - if (closure->type == OSL::ClosureColor::COMPONENT) { + if(closure->type == OSL::ClosureColor::COMPONENT) { OSL::ClosureComponent *comp = (OSL::ClosureComponent *)closure; CClosurePrimitive *prim = (CClosurePrimitive *)comp->data(); - if (prim) { + if(prim) { ShaderClosure sc; #ifdef OSL_SUPPORTS_WEIGHTED_CLOSURE_COMPONENTS @@ -448,11 +451,11 @@ static void flatten_volume_closure_tree(ShaderData *sd, } } } - else if (closure->type == OSL::ClosureColor::MUL) { + else if(closure->type == OSL::ClosureColor::MUL) { OSL::ClosureMul *mul = (OSL::ClosureMul *)closure; flatten_volume_closure_tree(sd, mul->closure, TO_FLOAT3(mul->weight) * weight); } - else if (closure->type == OSL::ClosureColor::ADD) { + else if(closure->type == OSL::ClosureColor::ADD) { OSL::ClosureAdd *add = (OSL::ClosureAdd *)closure; flatten_volume_closure_tree(sd, add->closureA, weight); flatten_volume_closure_tree(sd, add->closureB, weight); @@ -471,11 +474,11 @@ void OSLShader::eval_volume(KernelGlobals *kg, ShaderData *sd, int path_flag, Sh OSL::ShadingContext *octx = tdata->context[(int)ctx]; int shader = sd->shader & SHADER_MASK; - if (kg->osl->volume_state[shader]) + if(kg->osl->volume_state[shader]) ss->execute(*octx, *(kg->osl->volume_state[shader]), *globals); /* flatten closure tree */ - if (globals->Ci) + if(globals->Ci) flatten_volume_closure_tree(sd, globals->Ci); } @@ -493,7 +496,7 @@ void OSLShader::eval_displacement(KernelGlobals *kg, ShaderData *sd, ShaderConte OSL::ShadingContext *octx = tdata->context[(int)ctx]; int shader = sd->shader & SHADER_MASK; - if (kg->osl->displacement_state[shader]) + if(kg->osl->displacement_state[shader]) ss->execute(*octx, *(kg->osl->displacement_state[shader]), *globals); 
/* get back position */ @@ -520,7 +523,7 @@ float3 OSLShader::bsdf_eval(const ShaderData *sd, const ShaderClosure *sc, const CBSDFClosure *bsdf = (CBSDFClosure *)sc->prim; float3 bsdf_eval; - if (dot(sd->Ng, omega_in) >= 0.0f) + if(dot(sd->Ng, omega_in) >= 0.0f) bsdf_eval = bsdf->eval_reflect(sd->I, omega_in, pdf); else bsdf_eval = bsdf->eval_transmit(sd->I, omega_in, pdf); @@ -548,7 +551,7 @@ int OSLShader::find_attribute(KernelGlobals *kg, const ShaderData *sd, uint id, ustring stdname(std::string("geom:") + std::string(Attribute::standard_name((AttributeStandard)id))); OSLGlobals::AttributeMap::const_iterator it = attr_map.find(stdname); - if (it != attr_map.end()) { + if(it != attr_map.end()) { const OSLGlobals::Attribute &osl_attr = it->second; *elem = osl_attr.elem; diff --git a/intern/cycles/kernel/shaders/node_image_texture.osl b/intern/cycles/kernel/shaders/node_image_texture.osl index 526a87525cd..46a02cab32e 100644 --- a/intern/cycles/kernel/shaders/node_image_texture.osl +++ b/intern/cycles/kernel/shaders/node_image_texture.osl @@ -26,7 +26,7 @@ point map_to_tube(vector dir) { float u, v; v = (dir[2] + 1.0) * 0.5; - float len = sqrt(dir[0]*dir[0] + dir[1]*dir[1]); + float len = sqrt(dir[0] * dir[0] + dir[1] * dir[1]); if (len > 0.0) { u = (1.0 - (atan2(dir[0] / len, dir[1] / len) / M_PI)) * 0.5; } @@ -40,8 +40,8 @@ point map_to_sphere(vector dir) { float len = length(dir); float v, u; - if(len > 0.0) { - if(dir[0] == 0.0 && dir[1] == 0.0) { + if (len > 0.0) { + if (dir[0] == 0.0 && dir[1] == 0.0) { u = 0.0; /* Othwise domain error. 
*/ } else { diff --git a/intern/cycles/kernel/shaders/node_math.osl b/intern/cycles/kernel/shaders/node_math.osl index bbc008b4299..7eef97fd7e8 100644 --- a/intern/cycles/kernel/shaders/node_math.osl +++ b/intern/cycles/kernel/shaders/node_math.osl @@ -93,8 +93,8 @@ shader node_math( Value = Value1 > Value2; else if (type == "Modulo") Value = safe_modulo(Value1, Value2); - else if (type == "Absolute") - Value = fabs(Value1); + else if (type == "Absolute") + Value = fabs(Value1); if (Clamp) Value = clamp(Value, 0.0, 1.0); diff --git a/intern/cycles/kernel/shaders/node_musgrave_texture.osl b/intern/cycles/kernel/shaders/node_musgrave_texture.osl index a349dc8cb9a..4f95dec910a 100644 --- a/intern/cycles/kernel/shaders/node_musgrave_texture.osl +++ b/intern/cycles/kernel/shaders/node_musgrave_texture.osl @@ -26,7 +26,7 @@ * from "Texturing and Modelling: A procedural approach" */ -float noise_musgrave_fBm(point p, string basis, float H, float lacunarity, float octaves) +float noise_musgrave_fBm(point p, float H, float lacunarity, float octaves) { float rmd; float value = 0.0; @@ -54,7 +54,7 @@ float noise_musgrave_fBm(point p, string basis, float H, float lacunarity, float * octaves: number of frequencies in the fBm */ -float noise_musgrave_multi_fractal(point p, string basis, float H, float lacunarity, float octaves) +float noise_musgrave_multi_fractal(point p, float H, float lacunarity, float octaves) { float rmd; float value = 1.0; @@ -83,7 +83,7 @@ float noise_musgrave_multi_fractal(point p, string basis, float H, float lacunar * offset: raises the terrain from `sea level' */ -float noise_musgrave_hetero_terrain(point p, string basis, float H, float lacunarity, float octaves, float offset) +float noise_musgrave_hetero_terrain(point p, float H, float lacunarity, float octaves, float offset) { float value, increment, rmd; float pwHL = pow(lacunarity, -H); @@ -118,8 +118,8 @@ float noise_musgrave_hetero_terrain(point p, string basis, float H, float lacuna * offset: 
raises the terrain from `sea level' */ -float noise_musgrave_hybrid_multi_fractal(point p, string basis, float H, - float lacunarity, float octaves, float offset, float gain) +float noise_musgrave_hybrid_multi_fractal(point p, float H, float lacunarity, + float octaves, float offset, float gain) { float result, signal, weight, rmd; float pwHL = pow(lacunarity, -H); @@ -156,8 +156,8 @@ float noise_musgrave_hybrid_multi_fractal(point p, string basis, float H, * offset: raises the terrain from `sea level' */ -float noise_musgrave_ridged_multi_fractal(point p, string basis, float H, - float lacunarity, float octaves, float offset, float gain) +float noise_musgrave_ridged_multi_fractal(point p, float H, float lacunarity, + float octaves, float offset, float gain) { float result, signal, weight; float pwHL = pow(lacunarity, -H); @@ -201,7 +201,6 @@ shader node_musgrave_texture( float dimension = max(Dimension, 1e-5); float octaves = clamp(Detail, 0.0, 16.0); float lacunarity = max(Lacunarity, 1e-5); - string Basis = "Perlin"; float intensity = 1.0; point p = Vector; @@ -212,15 +211,15 @@ shader node_musgrave_texture( p = p * Scale; if (Type == "Multifractal") - Fac = intensity * noise_musgrave_multi_fractal(p, Basis, dimension, lacunarity, octaves); + Fac = intensity * noise_musgrave_multi_fractal(p, dimension, lacunarity, octaves); else if (Type == "fBM") - Fac = intensity * noise_musgrave_fBm(p, Basis, dimension, lacunarity, octaves); + Fac = intensity * noise_musgrave_fBm(p, dimension, lacunarity, octaves); else if (Type == "Hybrid Multifractal") - Fac = intensity * noise_musgrave_hybrid_multi_fractal(p, Basis, dimension, lacunarity, octaves, Offset, Gain); + Fac = intensity * noise_musgrave_hybrid_multi_fractal(p, dimension, lacunarity, octaves, Offset, Gain); else if (Type == "Ridged Multifractal") - Fac = intensity * noise_musgrave_ridged_multi_fractal(p, Basis, dimension, lacunarity, octaves, Offset, Gain); + Fac = intensity * 
noise_musgrave_ridged_multi_fractal(p, dimension, lacunarity, octaves, Offset, Gain); else if (Type == "Hetero Terrain") - Fac = intensity * noise_musgrave_hetero_terrain(p, Basis, dimension, lacunarity, octaves, Offset); + Fac = intensity * noise_musgrave_hetero_terrain(p, dimension, lacunarity, octaves, Offset); Color = color(Fac, Fac, Fac); } diff --git a/intern/cycles/kernel/shaders/node_noise_texture.osl b/intern/cycles/kernel/shaders/node_noise_texture.osl index dabc0b6843f..e83e5b5b211 100644 --- a/intern/cycles/kernel/shaders/node_noise_texture.osl +++ b/intern/cycles/kernel/shaders/node_noise_texture.osl @@ -19,23 +19,23 @@ /* Noise */ -float noise(point p, string basis, float distortion, float detail, float fac, color Color) +float noise(point p, float distortion, float detail, float fac, color Color) { point r; int hard = 0; if (distortion != 0.0) { - r[0] = noise_basis(p + point(13.5), basis) * distortion; - r[1] = noise_basis(p, basis) * distortion; - r[2] = noise_basis(p - point(13.5), basis) * distortion; + r[0] = safe_noise(p + point(13.5), "unsigned") * distortion; + r[1] = safe_noise(p, "unsigned") * distortion; + r[2] = safe_noise(p - point(13.5), "unsigned") * distortion; p += r; } - fac = noise_turbulence(p, basis, detail, hard); + fac = noise_turbulence(p, detail, hard); - Color = color(fac, noise_turbulence(point(p[1], p[0], p[2]), basis, detail, hard), - noise_turbulence(point(p[1], p[2], p[0]), basis, detail, hard)); + Color = color(fac, noise_turbulence(point(p[1], p[0], p[2]), detail, hard), + noise_turbulence(point(p[1], p[2], p[0]), detail, hard)); return fac; } @@ -55,7 +55,6 @@ shader node_noise_texture( if (use_mapping) p = transform(mapping, p); - string Basis = "Perlin"; - Fac = noise(p * Scale, Basis, Distortion, Detail, Fac, Color); + Fac = noise(p * Scale, Distortion, Detail, Fac, Color); } diff --git a/intern/cycles/kernel/shaders/node_texture.h b/intern/cycles/kernel/shaders/node_texture.h index 5f9cd5afa47..fc2cfdcd55c 100644 
--- a/intern/cycles/kernel/shaders/node_texture.h +++ b/intern/cycles/kernel/shaders/node_texture.h @@ -14,32 +14,6 @@ * limitations under the License. */ -/* Voronoi Distances */ - -float voronoi_distance(string distance_metric, vector d, float e) -{ -#if 0 - if (distance_metric == "Distance Squared") -#endif - return dot(d, d); -#if 0 - if (distance_metric == "Actual Distance") - return length(d); - if (distance_metric == "Manhattan") - return fabs(d[0]) + fabs(d[1]) + fabs(d[2]); - if (distance_metric == "Chebychev") - return max(fabs(d[0]), max(fabs(d[1]), fabs(d[2]))); - if (distance_metric == "Minkovsky 1/2") - return sqrt(fabs(d[0])) + sqrt(fabs(d[1])) + sqrt(fabs(d[1])); - if (distance_metric == "Minkovsky 4") - return sqrt(sqrt(dot(d * d, d * d))); - if (distance_metric == "Minkovsky") - return pow(pow(fabs(d[0]), e) + pow(fabs(d[1]), e) + pow(fabs(d[2]), e), 1.0 / e); - - return 0.0; -#endif -} - /* Voronoi / Worley like */ color cellnoise_color(point p) @@ -51,7 +25,7 @@ color cellnoise_color(point p) return color(r, g, b); } -void voronoi(point p, string distance_metric, float e, float da[4], point pa[4]) +void voronoi(point p, float e, float da[4], point pa[4]) { /* returns distances in da and point coords in pa */ int xx, yy, zz, xi, yi, zi; @@ -71,7 +45,7 @@ void voronoi(point p, string distance_metric, float e, float da[4], point pa[4]) point ip = point(xx, yy, zz); point vp = (point)cellnoise_color(ip); point pd = p - (vp + ip); - float d = voronoi_distance(distance_metric, pd, e); + float d = dot(pd, pd); vp += point(xx, yy, zz); @@ -111,46 +85,6 @@ void voronoi(point p, string distance_metric, float e, float da[4], point pa[4]) } } -float voronoi_Fn(point p, int n) -{ - float da[4]; - point pa[4]; - - voronoi(p, "Distance Squared", 0, da, pa); - - return da[n]; -} - -float voronoi_FnFn(point p, int n1, int n2) -{ - float da[4]; - point pa[4]; - - voronoi(p, "Distance Squared", 0, da, pa); - - return da[n2] - da[n1]; -} - -float voronoi_F1(point 
p) { return voronoi_Fn(p, 0); } -float voronoi_F2(point p) { return voronoi_Fn(p, 1); } -float voronoi_F3(point p) { return voronoi_Fn(p, 2); } -float voronoi_F4(point p) { return voronoi_Fn(p, 3); } -float voronoi_F1F2(point p) { return voronoi_FnFn(p, 0, 1); } - -float voronoi_Cr(point p) -{ - /* crackle type pattern, just a scale/clamp of F2-F1 */ - float t = 10.0 * voronoi_F1F2(p); - return (t > 1.0) ? 1.0 : t; -} - -float voronoi_F1S(point p) { return 2.0 * voronoi_F1(p) - 1.0; } -float voronoi_F2S(point p) { return 2.0 * voronoi_F2(p) - 1.0; } -float voronoi_F3S(point p) { return 2.0 * voronoi_F3(p) - 1.0; } -float voronoi_F4S(point p) { return 2.0 * voronoi_F4(p) - 1.0; } -float voronoi_F1F2S(point p) { return 2.0 * voronoi_F1F2(p) - 1.0; } -float voronoi_CrS(point p) { return 2.0 * voronoi_Cr(p) - 1.0; } - /* Noise Bases */ float safe_noise(point p, string type) @@ -172,39 +106,9 @@ float safe_noise(point p, string type) return f; } -float noise_basis(point p, string basis) -{ - if (basis == "Perlin") - return safe_noise(p, "unsigned"); - if (basis == "Voronoi F1") - return voronoi_F1S(p); - if (basis == "Voronoi F2") - return voronoi_F2S(p); - if (basis == "Voronoi F3") - return voronoi_F3S(p); - if (basis == "Voronoi F4") - return voronoi_F4S(p); - if (basis == "Voronoi F2-F1") - return voronoi_F1F2S(p); - if (basis == "Voronoi Crackle") - return voronoi_CrS(p); - if (basis == "Cell Noise") - return cellnoise(p); - - return 0.0; -} - -/* Soft/Hard Noise */ - -float noise_basis_hard(point p, string basis, int hard) -{ - float t = noise_basis(p, basis); - return (hard) ? 
fabs(2.0 * t - 1.0) : t; -} - /* Turbulence */ -float noise_turbulence(point p, string basis, float details, int hard) +float noise_turbulence(point p, float details, int hard) { float fscale = 1.0; float amp = 1.0; @@ -215,7 +119,7 @@ float noise_turbulence(point p, string basis, float details, int hard) n = (int)octaves; for (i = 0; i <= n; i++) { - float t = noise_basis(fscale * p, basis); + float t = safe_noise(fscale * p, "unsigned"); if (hard) t = fabs(2.0 * t - 1.0); @@ -228,7 +132,7 @@ float noise_turbulence(point p, string basis, float details, int hard) float rmd = octaves - floor(octaves); if (rmd != 0.0) { - float t = noise_basis(fscale * p, basis); + float t = safe_noise(fscale * p, "unsigned"); if (hard) t = fabs(2.0 * t - 1.0); diff --git a/intern/cycles/kernel/shaders/node_voronoi_texture.osl b/intern/cycles/kernel/shaders/node_voronoi_texture.osl index df169599d08..29e143ae207 100644 --- a/intern/cycles/kernel/shaders/node_voronoi_texture.osl +++ b/intern/cycles/kernel/shaders/node_voronoi_texture.osl @@ -37,7 +37,7 @@ shader node_voronoi_texture( float da[4]; point pa[4]; - voronoi(p * Scale, "Distance Squared", 1.0, da, pa); + voronoi(p * Scale, 1.0, da, pa); /* Colored output */ if (Coloring == "Intensity") { diff --git a/intern/cycles/kernel/shaders/node_wave_texture.osl b/intern/cycles/kernel/shaders/node_wave_texture.osl index a95752fc592..569f284cbac 100644 --- a/intern/cycles/kernel/shaders/node_wave_texture.osl +++ b/intern/cycles/kernel/shaders/node_wave_texture.osl @@ -31,7 +31,7 @@ float wave(point p, string type, float detail, float distortion, float dscale) } if (distortion != 0.0) { - n = n + (distortion * noise_turbulence(p * dscale, "Perlin", detail, 0)); + n = n + (distortion * noise_turbulence(p * dscale, detail, 0)); } return 0.5 + 0.5 * sin(n); } diff --git a/intern/cycles/kernel/shaders/stdosl.h b/intern/cycles/kernel/shaders/stdosl.h index 6babe98717c..697a1756119 100644 --- a/intern/cycles/kernel/shaders/stdosl.h +++ 
b/intern/cycles/kernel/shaders/stdosl.h @@ -249,7 +249,21 @@ point rotate (point p, float angle, point a, point b) { vector axis = normalize (b - a); float cosang, sinang; + /* Older OSX has major issues with sincos() function, + * it's likely a bug in OSL or LLVM. Until we've + * updated to new versions of these libraries we'll + * use a workaround to prevent possible crashes on all + * the platforms. + * + * Shouldn't be that bad because it's mainly used for + * anisotropic shader where angle is usually constant. + */ +#if 0 sincos (angle, sinang, cosang); +#else + sinang = sin (angle); + cosang = cos (angle); +#endif float cosang1 = 1.0 - cosang; float x = axis[0], y = axis[1], z = axis[2]; matrix M = matrix (x * x + (1.0 - x * x) * cosang, diff --git a/intern/cycles/kernel/split/kernel_background_buffer_update.h b/intern/cycles/kernel/split/kernel_background_buffer_update.h new file mode 100644 index 00000000000..181a1054a0d --- /dev/null +++ b/intern/cycles/kernel/split/kernel_background_buffer_update.h @@ -0,0 +1,255 @@ +/* + * Copyright 2011-2015 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "kernel_split_common.h" + +/* Note on kernel_background_buffer_update kernel. + * This is the fourth kernel in the ray tracing logic, and the third + * of the path iteration kernels. 
This kernel takes care of rays that hit + * the background (sceneintersect kernel), and for the rays of + * state RAY_UPDATE_BUFFER it updates the ray's accumulated radiance in + * the output buffer. This kernel also takes care of rays that have been determined + * to-be-regenerated. + * + * We will empty QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue in this kernel + * + * Typically all rays that are in state RAY_HIT_BACKGROUND, RAY_UPDATE_BUFFER + * will be eventually set to RAY_TO_REGENERATE state in this kernel. Finally all rays of ray_state + * RAY_TO_REGENERATE will be regenerated and put in queue QUEUE_ACTIVE_AND_REGENERATED_RAYS. + * + * The input and output are as follows, + * + * rng_coop ---------------------------------------------|--- kernel_background_buffer_update --|--- PathRadiance_coop + * throughput_coop --------------------------------------| |--- L_transparent_coop + * per_sample_output_buffers ----------------------------| |--- per_sample_output_buffers + * Ray_coop ---------------------------------------------| |--- ray_state + * PathState_coop ---------------------------------------| |--- Queue_data (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS) + * L_transparent_coop -----------------------------------| |--- Queue_data (QUEUE_ACTIVE_AND_REGENERATED_RAYS) + * ray_state --------------------------------------------| |--- Queue_index (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS) + * Queue_data (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS) ----| |--- Queue_index (QUEUE_ACTIVE_AND_REGENERATED_RAYS) + * Queue_index (QUEUE_ACTIVE_AND_REGENERATED_RAYS) ------| |--- work_array + * parallel_samples -------------------------------------| |--- PathState_coop + * end_sample -------------------------------------------| |--- throughput_coop + * kg (globals + data) ----------------------------------| |--- rng_coop + * rng_state --------------------------------------------| |--- Ray + * PathRadiance_coop ------------------------------------| | + * sw 
---------------------------------------------------| | + * sh ---------------------------------------------------| | + * sx ---------------------------------------------------| | + * sy ---------------------------------------------------| | + * stride -----------------------------------------------| | + * work_array -------------------------------------------| |--- work_array + * queuesize --------------------------------------------| | + * start_sample -----------------------------------------| |--- work_pool_wgs + * work_pool_wgs ----------------------------------------| | + * num_samples ------------------------------------------| | + * + * note on shader_data : shader_data argument is neither an input nor an output for this kernel. It is just filled and consumed here itself. + * Note on Queues : + * This kernel fetches rays from QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue. + * + * State of queues when this kernel is called : + * At entry, + * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE rays + * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_UPDATE_BUFFER, RAY_HIT_BACKGROUND, RAY_TO_REGENERATE rays + * At exit, + * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE and RAY_REGENERATED rays + * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be empty + */ +ccl_device char kernel_background_buffer_update( + ccl_global char *globals, + ccl_constant KernelData *data, + ccl_global char *shader_data, + ccl_global float *per_sample_output_buffers, + ccl_global uint *rng_state, + ccl_global uint *rng_coop, /* Required for buffer Update */ + ccl_global float3 *throughput_coop, /* Required for background hit processing */ + PathRadiance *PathRadiance_coop, /* Required for background hit processing and buffer Update */ + ccl_global Ray *Ray_coop, /* Required for background hit processing */ + ccl_global PathState *PathState_coop, /* Required for background hit processing */ + ccl_global float *L_transparent_coop, /* Required for 
background hit processing and buffer Update */ + ccl_global char *ray_state, /* Stores information on the current state of a ray */ + int sw, int sh, int sx, int sy, int stride, + int rng_state_offset_x, + int rng_state_offset_y, + int rng_state_stride, + ccl_global unsigned int *work_array, /* Denotes work of each ray */ + int end_sample, + int start_sample, +#ifdef __WORK_STEALING__ + ccl_global unsigned int *work_pool_wgs, + unsigned int num_samples, +#endif +#ifdef __KERNEL_DEBUG__ + DebugData *debugdata_coop, +#endif + int parallel_samples, /* Number of samples to be processed in parallel */ + int ray_index) +{ + char enqueue_flag = 0; + + /* Load kernel globals structure and ShaderData structure */ + KernelGlobals *kg = (KernelGlobals *)globals; + ShaderData *sd = (ShaderData *)shader_data; + +#ifdef __KERNEL_DEBUG__ + DebugData *debug_data = &debugdata_coop[ray_index]; +#endif + ccl_global PathState *state = &PathState_coop[ray_index]; + PathRadiance *L = L = &PathRadiance_coop[ray_index]; + ccl_global Ray *ray = &Ray_coop[ray_index]; + ccl_global float3 *throughput = &throughput_coop[ray_index]; + ccl_global float *L_transparent = &L_transparent_coop[ray_index]; + ccl_global uint *rng = &rng_coop[ray_index]; + +#ifdef __WORK_STEALING__ + unsigned int my_work; + ccl_global float *initial_per_sample_output_buffers; + ccl_global uint *initial_rng; +#endif + unsigned int sample; + unsigned int tile_x; + unsigned int tile_y; + unsigned int pixel_x; + unsigned int pixel_y; + unsigned int my_sample_tile; + +#ifdef __WORK_STEALING__ + my_work = work_array[ray_index]; + sample = get_my_sample(my_work, sw, sh, parallel_samples, ray_index) + start_sample; + get_pixel_tile_position(&pixel_x, &pixel_y, + &tile_x, &tile_y, + my_work, + sw, sh, sx, sy, + parallel_samples, + ray_index); + my_sample_tile = 0; + initial_per_sample_output_buffers = per_sample_output_buffers; + initial_rng = rng_state; +#else /* __WORK_STEALING__ */ + sample = work_array[ray_index]; + int 
tile_index = ray_index / parallel_samples; + /* buffer and rng_state's stride is "stride". Find x and y using ray_index */ + tile_x = tile_index % sw; + tile_y = tile_index / sw; + my_sample_tile = ray_index - (tile_index * parallel_samples); +#endif /* __WORK_STEALING__ */ + + rng_state += (rng_state_offset_x + tile_x) + (rng_state_offset_y + tile_y) * rng_state_stride; + per_sample_output_buffers += (((tile_x + (tile_y * stride)) * parallel_samples) + my_sample_tile) * kernel_data.film.pass_stride; + + if(IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND)) { + /* eval background shader if nothing hit */ + if(kernel_data.background.transparent && (state->flag & PATH_RAY_CAMERA)) { + *L_transparent = (*L_transparent) + average((*throughput)); +#ifdef __PASSES__ + if(!(kernel_data.film.pass_flag & PASS_BACKGROUND)) +#endif + ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER); + } + + if(IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND)) + { +#ifdef __BACKGROUND__ + /* sample background shader */ + float3 L_background = indirect_background(kg, state, ray, sd); + path_radiance_accum_background(L, (*throughput), L_background, state->bounce); +#endif + ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER); + } + } + + if(IS_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER)) { + float3 L_sum = path_radiance_clamp_and_sum(kg, L); + kernel_write_light_passes(kg, per_sample_output_buffers, L, sample); +#ifdef __KERNEL_DEBUG__ + kernel_write_debug_passes(kg, per_sample_output_buffers, state, debug_data, sample); +#endif + float4 L_rad = make_float4(L_sum.x, L_sum.y, L_sum.z, 1.0f - (*L_transparent)); + + /* accumulate result in output buffer */ + kernel_write_pass_float4(per_sample_output_buffers, sample, L_rad); + path_rng_end(kg, rng_state, *rng); + + ASSIGN_RAY_STATE(ray_state, ray_index, RAY_TO_REGENERATE); + } + + if(IS_STATE(ray_state, ray_index, RAY_TO_REGENERATE)) { +#ifdef __WORK_STEALING__ + /* We have completed current work; So get next work */ + int 
valid_work = get_next_work(work_pool_wgs, &my_work, sw, sh, num_samples, parallel_samples, ray_index); + if(!valid_work) { + /* If work is invalid, this means no more work is available and the thread may exit */ + ASSIGN_RAY_STATE(ray_state, ray_index, RAY_INACTIVE); + } +#else /* __WORK_STEALING__ */ + if((sample + parallel_samples) >= end_sample) { + ASSIGN_RAY_STATE(ray_state, ray_index, RAY_INACTIVE); + } +#endif /* __WORK_STEALING__ */ + + if(IS_STATE(ray_state, ray_index, RAY_TO_REGENERATE)) { +#ifdef __WORK_STEALING__ + work_array[ray_index] = my_work; + /* Get the sample associated with the current work */ + sample = get_my_sample(my_work, sw, sh, parallel_samples, ray_index) + start_sample; + /* Get pixel and tile position associated with current work */ + get_pixel_tile_position(&pixel_x, &pixel_y, &tile_x, &tile_y, my_work, sw, sh, sx, sy, parallel_samples, ray_index); + my_sample_tile = 0; + + /* Remap rng_state according to the current work */ + rng_state = initial_rng + ((rng_state_offset_x + tile_x) + (rng_state_offset_y + tile_y) * rng_state_stride); + /* Remap per_sample_output_buffers according to the current work */ + per_sample_output_buffers = initial_per_sample_output_buffers + + (((tile_x + (tile_y * stride)) * parallel_samples) + my_sample_tile) * kernel_data.film.pass_stride; +#else /* __WORK_STEALING__ */ + work_array[ray_index] = sample + parallel_samples; + sample = work_array[ray_index]; + + /* Get ray position from ray index */ + pixel_x = sx + ((ray_index / parallel_samples) % sw); + pixel_y = sy + ((ray_index / parallel_samples) / sw); +#endif /* __WORK_STEALING__ */ + + /* Initialize random numbers and ray. */ + kernel_path_trace_setup(kg, rng_state, sample, pixel_x, pixel_y, rng, ray); + + if(ray->t != 0.0f) { + /* Initialize throughput, L_transparent, Ray, PathState; + * These rays proceed with path-iteration. 
+ */ + *throughput = make_float3(1.0f, 1.0f, 1.0f); + *L_transparent = 0.0f; + path_radiance_init(L, kernel_data.film.use_light_pass); + path_state_init(kg, state, rng, sample, ray); +#ifdef __KERNEL_DEBUG__ + debug_data_init(debug_data); +#endif + ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED); + enqueue_flag = 1; + } else { + /* These rays do not participate in path-iteration. */ + float4 L_rad = make_float4(0.0f, 0.0f, 0.0f, 0.0f); + /* Accumulate result in output buffer. */ + kernel_write_pass_float4(per_sample_output_buffers, sample, L_rad); + path_rng_end(kg, rng_state, *rng); + + ASSIGN_RAY_STATE(ray_state, ray_index, RAY_TO_REGENERATE); + } + } + } + return enqueue_flag; +} diff --git a/intern/cycles/kernel/split/kernel_data_init.h b/intern/cycles/kernel/split/kernel_data_init.h new file mode 100644 index 00000000000..2cd98e466c1 --- /dev/null +++ b/intern/cycles/kernel/split/kernel_data_init.h @@ -0,0 +1,418 @@ +/* + * Copyright 2011-2015 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "kernel_split_common.h" + +/* Note on kernel_data_initialization kernel + * This kernel Initializes structures needed in path-iteration kernels. + * This is the first kernel in ray-tracing logic. 
+ * + * Ray state of rays outside the tile-boundary will be marked RAY_INACTIVE + * + * Its input and output are as follows, + * + * Un-initialized rng---------------|--- kernel_data_initialization ---|--- Initialized rng + * Un-initialized throughput -------| |--- Initialized throughput + * Un-initialized L_transparent ----| |--- Initialized L_transparent + * Un-initialized PathRadiance -----| |--- Initialized PathRadiance + * Un-initialized Ray --------------| |--- Initialized Ray + * Un-initialized PathState --------| |--- Initialized PathState + * Un-initialized QueueData --------| |--- Initialized QueueData (to QUEUE_EMPTY_SLOT) + * Un-initialized QueueIndex -------| |--- Initialized QueueIndex (to 0) + * Un-initialized use_queues_flag---| |--- Initialized use_queues_flag (to false) + * Un-initialized ray_state --------| |--- Initialized ray_state + * parallel_samples --------------- | |--- Initialized per_sample_output_buffers + * rng_state -----------------------| |--- Initialized work_array + * data ----------------------------| |--- Initialized work_pool_wgs + * start_sample --------------------| | + * sx ------------------------------| | + * sy ------------------------------| | + * sw ------------------------------| | + * sh ------------------------------| | + * stride --------------------------| | + * queuesize -----------------------| | + * num_samples ---------------------| | + * + * Note on Queues : + * All slots in queues are initialized to queue empty slot; + * The number of elements in the queues is initialized to 0; + */ +ccl_device void kernel_data_init( + ccl_global char *globals, + ccl_global char *shader_data_sd, /* Arguments related to ShaderData */ + ccl_global char *shader_data_sd_DL_shadow, /* Arguments related to ShaderData */ + + ccl_global float3 *P_sd, + ccl_global float3 *P_sd_DL_shadow, + + ccl_global float3 *N_sd, + ccl_global float3 *N_sd_DL_shadow, + + ccl_global float3 *Ng_sd, + ccl_global float3 *Ng_sd_DL_shadow, + + ccl_global
float3 *I_sd, + ccl_global float3 *I_sd_DL_shadow, + + ccl_global int *shader_sd, + ccl_global int *shader_sd_DL_shadow, + + ccl_global int *flag_sd, + ccl_global int *flag_sd_DL_shadow, + + ccl_global int *prim_sd, + ccl_global int *prim_sd_DL_shadow, + + ccl_global int *type_sd, + ccl_global int *type_sd_DL_shadow, + + ccl_global float *u_sd, + ccl_global float *u_sd_DL_shadow, + + ccl_global float *v_sd, + ccl_global float *v_sd_DL_shadow, + + ccl_global int *object_sd, + ccl_global int *object_sd_DL_shadow, + + ccl_global float *time_sd, + ccl_global float *time_sd_DL_shadow, + + ccl_global float *ray_length_sd, + ccl_global float *ray_length_sd_DL_shadow, + + ccl_global int *ray_depth_sd, + ccl_global int *ray_depth_sd_DL_shadow, + + ccl_global int *transparent_depth_sd, + ccl_global int *transparent_depth_sd_DL_shadow, + + /* Ray differentials. */ + ccl_global differential3 *dP_sd, + ccl_global differential3 *dP_sd_DL_shadow, + + ccl_global differential3 *dI_sd, + ccl_global differential3 *dI_sd_DL_shadow, + + ccl_global differential *du_sd, + ccl_global differential *du_sd_DL_shadow, + + ccl_global differential *dv_sd, + ccl_global differential *dv_sd_DL_shadow, + + /* Dp/Du */ + ccl_global float3 *dPdu_sd, + ccl_global float3 *dPdu_sd_DL_shadow, + + ccl_global float3 *dPdv_sd, + ccl_global float3 *dPdv_sd_DL_shadow, + + /* Object motion.
*/ + ccl_global Transform *ob_tfm_sd, + ccl_global Transform *ob_tfm_sd_DL_shadow, + + ccl_global Transform *ob_itfm_sd, + ccl_global Transform *ob_itfm_sd_DL_shadow, + + ShaderClosure *closure_sd, + ShaderClosure *closure_sd_DL_shadow, + + ccl_global int *num_closure_sd, + ccl_global int *num_closure_sd_DL_shadow, + + ccl_global float *randb_closure_sd, + ccl_global float *randb_closure_sd_DL_shadow, + + ccl_global float3 *ray_P_sd, + ccl_global float3 *ray_P_sd_DL_shadow, + + ccl_global differential3 *ray_dP_sd, + ccl_global differential3 *ray_dP_sd_DL_shadow, + + ccl_constant KernelData *data, + ccl_global float *per_sample_output_buffers, + ccl_global uint *rng_state, + ccl_global uint *rng_coop, /* rng array to store rng values for all rays */ + ccl_global float3 *throughput_coop, /* throughput array to store throughput values for all rays */ + ccl_global float *L_transparent_coop, /* L_transparent array to store L_transparent values for all rays */ + PathRadiance *PathRadiance_coop, /* PathRadiance array to store PathRadiance values for all rays */ + ccl_global Ray *Ray_coop, /* Ray array to store Ray information for all rays */ + ccl_global PathState *PathState_coop, /* PathState array to store PathState information for all rays */ + ccl_global char *ray_state, /* Stores information on current state of a ray */ + +#define KERNEL_TEX(type, ttype, name) \ + ccl_global type *name, +#include "../kernel_textures.h" + + int start_sample, int sx, int sy, int sw, int sh, int offset, int stride, + int rng_state_offset_x, + int rng_state_offset_y, + int rng_state_stride, + ccl_global int *Queue_data, /* Memory for queues */ + ccl_global int *Queue_index, /* Tracks the number of elements in queues */ + int queuesize, /* size (capacity) of the queue */ + ccl_global char *use_queues_flag, /* flag to decide if scene-intersect kernel should use queues to fetch ray index */ + ccl_global unsigned int *work_array, /* work array to store which work each ray belongs to */
+#ifdef __WORK_STEALING__ + ccl_global unsigned int *work_pool_wgs, /* Work pool for each work group */ + unsigned int num_samples, /* Total number of samples per pixel */ +#endif +#ifdef __KERNEL_DEBUG__ + DebugData *debugdata_coop, +#endif + int parallel_samples) /* Number of samples to be processed in parallel */ +{ + + /* Load kernel globals structure */ + KernelGlobals *kg = (KernelGlobals *)globals; + + kg->data = data; +#define KERNEL_TEX(type, ttype, name) \ + kg->name = name; +#include "../kernel_textures.h" + + /* Load ShaderData structure */ + ShaderData *sd = (ShaderData *)shader_data_sd; + ShaderData *sd_DL_shadow = (ShaderData *)shader_data_sd_DL_shadow; + + sd->P = P_sd; + sd_DL_shadow->P = P_sd_DL_shadow; + + sd->N = N_sd; + sd_DL_shadow->N = N_sd_DL_shadow; + + sd->Ng = Ng_sd; + sd_DL_shadow->Ng = Ng_sd_DL_shadow; + + sd->I = I_sd; + sd_DL_shadow->I = I_sd_DL_shadow; + + sd->shader = shader_sd; + sd_DL_shadow->shader = shader_sd_DL_shadow; + + sd->flag = flag_sd; + sd_DL_shadow->flag = flag_sd_DL_shadow; + + sd->prim = prim_sd; + sd_DL_shadow->prim = prim_sd_DL_shadow; + + sd->type = type_sd; + sd_DL_shadow->type = type_sd_DL_shadow; + + sd->u = u_sd; + sd_DL_shadow->u = u_sd_DL_shadow; + + sd->v = v_sd; + sd_DL_shadow->v = v_sd_DL_shadow; + + sd->object = object_sd; + sd_DL_shadow->object = object_sd_DL_shadow; + + sd->time = time_sd; + sd_DL_shadow->time = time_sd_DL_shadow; + + sd->ray_length = ray_length_sd; + sd_DL_shadow->ray_length = ray_length_sd_DL_shadow; + + sd->ray_depth = ray_depth_sd; + sd_DL_shadow->ray_depth = ray_depth_sd_DL_shadow; + + sd->transparent_depth = transparent_depth_sd; + sd_DL_shadow->transparent_depth = transparent_depth_sd_DL_shadow; + +#ifdef __RAY_DIFFERENTIALS__ + sd->dP = dP_sd; + sd_DL_shadow->dP = dP_sd_DL_shadow; + + sd->dI = dI_sd; + sd_DL_shadow->dI = dI_sd_DL_shadow; + + sd->du = du_sd; + sd_DL_shadow->du = du_sd_DL_shadow; + + sd->dv = dv_sd; + sd_DL_shadow->dv = dv_sd_DL_shadow; +#ifdef __DPDU__ +
sd->dPdu = dPdu_sd; + sd_DL_shadow->dPdu = dPdu_sd_DL_shadow; + + sd->dPdv = dPdv_sd; + sd_DL_shadow->dPdv = dPdv_sd_DL_shadow; +#endif +#endif + +#ifdef __OBJECT_MOTION__ + sd->ob_tfm = ob_tfm_sd; + sd_DL_shadow->ob_tfm = ob_tfm_sd_DL_shadow; + + sd->ob_itfm = ob_itfm_sd; + sd_DL_shadow->ob_itfm = ob_itfm_sd_DL_shadow; +#endif + + sd->closure = closure_sd; + sd_DL_shadow->closure = closure_sd_DL_shadow; + + sd->num_closure = num_closure_sd; + sd_DL_shadow->num_closure = num_closure_sd_DL_shadow; + + sd->randb_closure = randb_closure_sd; + sd_DL_shadow->randb_closure = randb_closure_sd_DL_shadow; + + sd->ray_P = ray_P_sd; + sd_DL_shadow->ray_P = ray_P_sd_DL_shadow; + + sd->ray_dP = ray_dP_sd; + sd_DL_shadow->ray_dP = ray_dP_sd_DL_shadow; + + int thread_index = get_global_id(1) * get_global_size(0) + get_global_id(0); /* Flattened 2D global work-item index */ + +#ifdef __WORK_STEALING__ + int lid = get_local_id(1) * get_local_size(0) + get_local_id(0); + /* Initialize work_pool_wgs */ + if(lid == 0) { + int group_index = get_group_id(1) * get_num_groups(0) + get_group_id(0); + work_pool_wgs[group_index] = 0; + } + barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE); /* work_pool_wgs is a global buffer: fence global memory too so the reset above is visible to the whole work-group before get_next_work() below */ +#endif /* __WORK_STEALING__ */ + + /* Initialize queue data and queue index.
*/ + if(thread_index < queuesize) { + /* Initialize active ray queue */ + Queue_data[QUEUE_ACTIVE_AND_REGENERATED_RAYS * queuesize + thread_index] = QUEUE_EMPTY_SLOT; + /* Initialize background and buffer update queue */ + Queue_data[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS * queuesize + thread_index] = QUEUE_EMPTY_SLOT; + /* Initialize shadow ray cast of AO queue */ + Queue_data[QUEUE_SHADOW_RAY_CAST_AO_RAYS * queuesize + thread_index] = QUEUE_EMPTY_SLOT; + /* Initialize shadow ray cast of direct lighting queue */ + Queue_data[QUEUE_SHADOW_RAY_CAST_DL_RAYS * queuesize + thread_index] = QUEUE_EMPTY_SLOT; + } + + if(thread_index == 0) { + Queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS] = 0; + Queue_index[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS] = 0; + Queue_index[QUEUE_SHADOW_RAY_CAST_AO_RAYS] = 0; + Queue_index[QUEUE_SHADOW_RAY_CAST_DL_RAYS] = 0; + /* The scene-intersect kernel should not use the queues very first time. + * since the queue would be empty. + */ + use_queues_flag[0] = 0; + } + + int x = get_global_id(0); + int y = get_global_id(1); + + if(x < (sw * parallel_samples) && y < sh) { /* Only work-items inside the (sw * parallel_samples) x sh region set up a ray */ + + int ray_index = x + y * (sw * parallel_samples); + + /* This is the first assignment to ray_state; + * So we don't use ASSIGN_RAY_STATE macro. + */ + ray_state[ray_index] = RAY_ACTIVE; + + unsigned int my_sample; + unsigned int pixel_x; + unsigned int pixel_y; + unsigned int tile_x; + unsigned int tile_y; + unsigned int my_sample_tile; + +#ifdef __WORK_STEALING__ + unsigned int my_work = 0; + /* Get work. */ + get_next_work(work_pool_wgs, &my_work, sw, sh, num_samples, parallel_samples, ray_index); + /* Get the sample associated with the work. */ + my_sample = get_my_sample(my_work, sw, sh, parallel_samples, ray_index) + start_sample; + + my_sample_tile = 0; + + /* Get pixel and tile position associated with the work.
*/ + get_pixel_tile_position(&pixel_x, &pixel_y, + &tile_x, &tile_y, + my_work, + sw, sh, sx, sy, + parallel_samples, + ray_index); + work_array[ray_index] = my_work; +#else /* __WORK_STEALING__ */ + unsigned int tile_index = ray_index / parallel_samples; + tile_x = tile_index % sw; + tile_y = tile_index / sw; + my_sample_tile = ray_index - (tile_index * parallel_samples); + my_sample = my_sample_tile + start_sample; + + /* Initialize work array. */ + work_array[ray_index] = my_sample ; + + /* Calculate pixel position of this ray. */ + pixel_x = sx + tile_x; + pixel_y = sy + tile_y; +#endif /* __WORK_STEALING__ */ + + rng_state += (rng_state_offset_x + tile_x) + (rng_state_offset_y + tile_y) * rng_state_stride; + + /* Initialise per_sample_output_buffers to all zeros. */ + per_sample_output_buffers += (((tile_x + (tile_y * stride)) * parallel_samples) + (my_sample_tile)) * kernel_data.film.pass_stride; + int per_sample_output_buffers_iterator = 0; + for(per_sample_output_buffers_iterator = 0; + per_sample_output_buffers_iterator < kernel_data.film.pass_stride; + per_sample_output_buffers_iterator++) + { + per_sample_output_buffers[per_sample_output_buffers_iterator] = 0.0f; + } + + /* Initialize random numbers and ray. */ + kernel_path_trace_setup(kg, + rng_state, + my_sample, + pixel_x, pixel_y, + &rng_coop[ray_index], + &Ray_coop[ray_index]); + + if(Ray_coop[ray_index].t != 0.0f) { + /* Initialize throughput, L_transparent, Ray, PathState; + * These rays proceed with path-iteration. + */ + throughput_coop[ray_index] = make_float3(1.0f, 1.0f, 1.0f); + L_transparent_coop[ray_index] = 0.0f; + path_radiance_init(&PathRadiance_coop[ray_index], kernel_data.film.use_light_pass); + path_state_init(kg, + &PathState_coop[ray_index], + &rng_coop[ray_index], + my_sample, + &Ray_coop[ray_index]); +#ifdef __KERNEL_DEBUG__ + debug_data_init(&debugdata_coop[ray_index]); +#endif + } else { + /* These rays do not participate in path-iteration.
*/ + + float4 L_rad = make_float4(0.0f, 0.0f, 0.0f, 0.0f); + /* Accumulate result in output buffer. */ + kernel_write_pass_float4(per_sample_output_buffers, my_sample, L_rad); + path_rng_end(kg, rng_state, rng_coop[ray_index]); + + ASSIGN_RAY_STATE(ray_state, ray_index, RAY_TO_REGENERATE); + } + } + + /* Mark rest of the ray-state indices as RAY_INACTIVE. */ + if(thread_index < (get_global_size(0) * get_global_size(1)) - (sh * (sw * parallel_samples))) { + /* First assignment, hence we don't use ASSIGN_RAY_STATE macro */ + ray_state[((sw * parallel_samples) * sh) + thread_index] = RAY_INACTIVE; + } +} diff --git a/intern/cycles/kernel/split/kernel_direct_lighting.h b/intern/cycles/kernel/split/kernel_direct_lighting.h new file mode 100644 index 00000000000..50c83d06140 --- /dev/null +++ b/intern/cycles/kernel/split/kernel_direct_lighting.h @@ -0,0 +1,116 @@ +/* + * Copyright 2011-2015 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "kernel_split_common.h" + +/* Note on kernel_direct_lighting kernel. + * This is the eighth kernel in the ray tracing logic. This is the seventh + * of the path iteration kernels. This kernel takes care of direct lighting + * logic. However, the "shadow ray cast" part of direct lighting is handled + * in the next kernel. + * + * This kernel determines the rays for which a shadow_blocked() function associated with direct lighting should be executed.
+ * Those rays for which a shadow_blocked() function for direct-lighting must be executed are marked with flag RAY_SHADOW_RAY_CAST_DL and + * enqueued into the queue QUEUE_SHADOW_RAY_CAST_DL_RAYS + * + * The input and output are as follows, + * + * rng_coop -----------------------------------------|--- kernel_direct_lighting --|--- BSDFEval_coop + * PathState_coop -----------------------------------| |--- ISLamp_coop + * shader_data --------------------------------------| |--- LightRay_coop + * ray_state ----------------------------------------| |--- ray_state + * Queue_data (QUEUE_ACTIVE_AND_REGENERATED_RAYS) ---| | + * kg (globals + data) ------------------------------| | + * queuesize ----------------------------------------| | + * + * Note on shader_DL : shader_DL is neither input nor output to this kernel; shader_DL is filled and consumed in this kernel itself. + * Note on Queues : + * This kernel only reads from the QUEUE_ACTIVE_AND_REGENERATED_RAYS queue and processes + * only the rays of state RAY_ACTIVE; If a ray needs to execute the corresponding shadow_blocked + * part, after direct lighting, the ray is marked with RAY_SHADOW_RAY_CAST_DL flag. + * + * State of queues when this kernel is called : + * State of queues QUEUE_ACTIVE_AND_REGENERATED_RAYS and QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be the same + * before and after this kernel call. + * QUEUE_SHADOW_RAY_CAST_DL_RAYS queue will be filled with rays for which a shadow_blocked function must be executed, after this + * kernel call. Before this kernel call the QUEUE_SHADOW_RAY_CAST_DL_RAYS will be empty.
+ */ +ccl_device char kernel_direct_lighting( + ccl_global char *globals, + ccl_constant KernelData *data, + ccl_global char *shader_data, /* Required for direct lighting */ + ccl_global char *shader_DL, /* Required for direct lighting */ + ccl_global uint *rng_coop, /* Required for direct lighting */ + ccl_global PathState *PathState_coop, /* Required for direct lighting */ + ccl_global int *ISLamp_coop, /* Required for direct lighting */ + ccl_global Ray *LightRay_coop, /* Required for direct lighting */ + ccl_global BsdfEval *BSDFEval_coop, /* Required for direct lighting */ + ccl_global char *ray_state, /* Denotes the state of each ray */ + int ray_index) +{ + char enqueue_flag = 0; /* Return value; set to 1 only when the ray is flagged RAY_SHADOW_RAY_CAST_DL below */ + if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) { + /* Load kernel globals structure and ShaderData structure. */ + KernelGlobals *kg = (KernelGlobals *)globals; + ShaderData *sd = (ShaderData *)shader_data; + ShaderData *sd_DL = (ShaderData *)shader_DL; + + ccl_global PathState *state = &PathState_coop[ray_index]; + + /* direct lighting */ +#ifdef __EMISSION__ + if((kernel_data.integrator.use_direct_light && + (ccl_fetch(sd, flag) & SD_BSDF_HAS_EVAL))) + { + /* Sample illumination from lights to find path contribution. */ + ccl_global RNG* rng = &rng_coop[ray_index]; + float light_t = path_state_rng_1D(kg, rng, state, PRNG_LIGHT); + float light_u, light_v; + path_state_rng_2D(kg, rng, state, PRNG_LIGHT_U, &light_u, &light_v); + + LightSample ls; + light_sample(kg, + light_t, light_u, light_v, + ccl_fetch(sd, time), + ccl_fetch(sd, P), + state->bounce, + &ls); + + Ray light_ray; +#ifdef __OBJECT_MOTION__ + light_ray.time = ccl_fetch(sd, time); +#endif + + BsdfEval L_light; + bool is_lamp; + if(direct_emission(kg, sd, &ls, &light_ray, &L_light, &is_lamp, + state->bounce, state->transparent_bounce, sd_DL)) + { + /* Write intermediate data to global memory to access from + * the next kernel.
+ */ + LightRay_coop[ray_index] = light_ray; + BSDFEval_coop[ray_index] = L_light; + ISLamp_coop[ray_index] = is_lamp; + /* Mark ray state for next shadow kernel. */ + ADD_RAY_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL); + enqueue_flag = 1; + } + } +#endif /* __EMISSION__ */ + } + return enqueue_flag; +} diff --git a/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h b/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h new file mode 100644 index 00000000000..a75523a3e53 --- /dev/null +++ b/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h @@ -0,0 +1,264 @@ +/* + * Copyright 2011-2015 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "kernel_split_common.h" + +/* Note on kernel_holdout_emission_blurring_pathtermination_ao kernel. + * This is the sixth kernel in the ray tracing logic. This is the fifth + * of the path iteration kernels. This kernel takes care of the logic to process + * "material of type holdout", indirect primitive emission, bsdf blurring, + * probabilistic path termination and AO. + * + * This kernel determines the rays for which a shadow_blocked() function associated with AO should be executed.
+ * Those rays for which a shadow_blocked() function for AO must be executed are marked with flag RAY_SHADOW_RAY_CAST_AO and + * enqueued into the queue QUEUE_SHADOW_RAY_CAST_AO_RAYS + * + * Ray state of rays that are terminated in this kernel is changed to RAY_UPDATE_BUFFER + * + * The input and output are as follows, + * + * rng_coop ---------------------------------------------|--- kernel_holdout_emission_blurring_pathtermination_ao ---|--- Queue_index (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS) + * throughput_coop --------------------------------------| |--- PathState_coop + * PathRadiance_coop ------------------------------------| |--- throughput_coop + * Intersection_coop ------------------------------------| |--- L_transparent_coop + * PathState_coop ---------------------------------------| |--- per_sample_output_buffers + * L_transparent_coop -----------------------------------| |--- PathRadiance_coop + * shader_data ------------------------------------------| |--- ShaderData + * ray_state --------------------------------------------| |--- ray_state + * Queue_data (QUEUE_ACTIVE_AND_REGENERATED_RAYS) -------| |--- Queue_data (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS) + * Queue_index (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS) ---| |--- AOAlpha_coop + * kg (globals + data) ----------------------------------| |--- AOBSDF_coop + * parallel_samples -------------------------------------| |--- AOLightRay_coop + * per_sample_output_buffers ----------------------------| | + * sw ---------------------------------------------------| | + * sh ---------------------------------------------------| | + * sx ---------------------------------------------------| | + * sy ---------------------------------------------------| | + * stride -----------------------------------------------| | + * work_array -------------------------------------------| | + * queuesize --------------------------------------------| | + * start_sample -----------------------------------------| | + * + * Note on
Queues : + * This kernel fetches rays from the queue QUEUE_ACTIVE_AND_REGENERATED_RAYS and processes only + * the rays of state RAY_ACTIVE. + * There are different points in this kernel where a ray may terminate and reach RAY_UPDATE_BUFFER + * state. These rays are enqueued into QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue. These rays will + * still be present in QUEUE_ACTIVE_AND_REGENERATED_RAYS queue, but since their ray-state has been + * changed to RAY_UPDATE_BUFFER, there is no problem. + * + * State of queues when this kernel is called : + * At entry, + * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE and RAY_REGENERATED rays + * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_TO_REGENERATE rays. + * QUEUE_SHADOW_RAY_CAST_AO_RAYS will be empty. + * At exit, + * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE, RAY_REGENERATED and RAY_UPDATE_BUFFER rays + * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_TO_REGENERATE and RAY_UPDATE_BUFFER rays + * QUEUE_SHADOW_RAY_CAST_AO_RAYS will be filled with rays marked with flag RAY_SHADOW_RAY_CAST_AO + */ +ccl_device void kernel_holdout_emission_blurring_pathtermination_ao( + ccl_global char *globals, + ccl_constant KernelData *data, + ccl_global char *shader_data, /* Required throughout the kernel except probabilistic path termination and AO */ + ccl_global float *per_sample_output_buffers, + ccl_global uint *rng_coop, /* Required for "kernel_write_data_passes" and AO */ + ccl_global float3 *throughput_coop, /* Required for handling holdout material and AO */ + ccl_global float *L_transparent_coop, /* Required for handling holdout material */ + PathRadiance *PathRadiance_coop, /* Required for "kernel_write_data_passes" and indirect primitive emission */ + ccl_global PathState *PathState_coop, /* Required throughout the kernel and AO */ + Intersection *Intersection_coop, /* Required for indirect primitive emission */ + ccl_global float3 *AOAlpha_coop, /*
Required for AO */ + ccl_global float3 *AOBSDF_coop, /* Required for AO */ + ccl_global Ray *AOLightRay_coop, /* Required for AO */ + int sw, int sh, int sx, int sy, int stride, + ccl_global char *ray_state, /* Denotes the state of each ray */ + ccl_global unsigned int *work_array, /* Denotes the work that each ray belongs to */ +#ifdef __WORK_STEALING__ + unsigned int start_sample, +#endif + int parallel_samples, /* Number of samples to be processed in parallel */ + int ray_index, + char *enqueue_flag, /* Out: set to 1 when this ray switches to RAY_UPDATE_BUFFER */ + char *enqueue_flag_AO_SHADOW_RAY_CAST) /* Out: set to 1 when this ray is flagged RAY_SHADOW_RAY_CAST_AO */ +{ + /* Load kernel globals structure and ShaderData structure */ + KernelGlobals *kg = (KernelGlobals *)globals; + ShaderData *sd = (ShaderData *)shader_data; + +#ifdef __WORK_STEALING__ + unsigned int my_work; + unsigned int pixel_x; + unsigned int pixel_y; +#endif + unsigned int tile_x; + unsigned int tile_y; + int my_sample_tile; + unsigned int sample; + + ccl_global RNG *rng = 0x0; + ccl_global PathState *state = 0x0; + float3 throughput; /* The locals above are only assigned inside the first RAY_ACTIVE branch below */ + + if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) { + + throughput = throughput_coop[ray_index]; + state = &PathState_coop[ray_index]; + rng = &rng_coop[ray_index]; +#ifdef __WORK_STEALING__ + my_work = work_array[ray_index]; + sample = get_my_sample(my_work, sw, sh, parallel_samples, ray_index) + start_sample; + get_pixel_tile_position(&pixel_x, &pixel_y, + &tile_x, &tile_y, + my_work, + sw, sh, sx, sy, + parallel_samples, + ray_index); + my_sample_tile = 0; +#else /* __WORK_STEALING__ */ + sample = work_array[ray_index]; + /* Buffer's stride is "stride"; Find x and y using ray_index.
*/ + int tile_index = ray_index / parallel_samples; + tile_x = tile_index % sw; + tile_y = tile_index / sw; + my_sample_tile = ray_index - (tile_index * parallel_samples); +#endif /* __WORK_STEALING__ */ + per_sample_output_buffers += + (((tile_x + (tile_y * stride)) * parallel_samples) + my_sample_tile) * + kernel_data.film.pass_stride; + + /* holdout */ +#ifdef __HOLDOUT__ + if((ccl_fetch(sd, flag) & (SD_HOLDOUT|SD_HOLDOUT_MASK)) && + (state->flag & PATH_RAY_CAMERA)) + { + if(kernel_data.background.transparent) { + float3 holdout_weight; + + if(ccl_fetch(sd, flag) & SD_HOLDOUT_MASK) + holdout_weight = make_float3(1.0f, 1.0f, 1.0f); + else + holdout_weight = shader_holdout_eval(kg, sd); + + /* any throughput is ok, should all be identical here */ + L_transparent_coop[ray_index] += average(holdout_weight*throughput); + } + + if(ccl_fetch(sd, flag) & SD_HOLDOUT_MASK) { + ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER); + *enqueue_flag = 1; + } + } +#endif /* __HOLDOUT__ */ + } + + if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) { /* Re-checked: the holdout branch above may have moved the ray to RAY_UPDATE_BUFFER */ + PathRadiance *L = &PathRadiance_coop[ray_index]; + /* Holdout mask objects do not write data passes. */ + kernel_write_data_passes(kg, + per_sample_output_buffers, + L, + sd, + sample, + state, + throughput); + /* Blurring of bsdf after bounces, for rays that have a small likelihood + * of following this particular path (diffuse, rough glossy). + */ + if(kernel_data.integrator.filter_glossy != FLT_MAX) { + float blur_pdf = kernel_data.integrator.filter_glossy*state->min_ray_pdf; + if(blur_pdf < 1.0f) { + float blur_roughness = sqrtf(1.0f - blur_pdf)*0.5f; + shader_bsdf_blur(kg, sd, blur_roughness); + } + } + +#ifdef __EMISSION__ + /* emission */ + if(ccl_fetch(sd, flag) & SD_EMISSION) { + /* TODO(sergey): is isect.t wrong here for transparent surfaces?
*/ + float3 emission = indirect_primitive_emission( + kg, + sd, + Intersection_coop[ray_index].t, + state->flag, + state->ray_pdf); + path_radiance_accum_emission(L, throughput, emission, state->bounce); + } +#endif /* __EMISSION__ */ + + /* Path termination. this is a strange place to put the termination, it's + * mainly due to the mixed in MIS that we use. gives too many unneeded + * shader evaluations, only need emission if we are going to terminate. + */ + float probability = path_state_terminate_probability(kg, state, throughput); + + if(probability == 0.0f) { + ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER); + *enqueue_flag = 1; + } + + if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) { + if(probability != 1.0f) { + float terminate = path_state_rng_1D_for_decision(kg, rng, state, PRNG_TERMINATE); + if(terminate >= probability) { + ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER); + *enqueue_flag = 1; + } else { + throughput_coop[ray_index] = throughput/probability; + } + } + } + } + +#ifdef __AO__ + if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) { + /* ambient occlusion */ + if(kernel_data.integrator.use_ambient_occlusion || + (ccl_fetch(sd, flag) & SD_AO)) + { + /* todo: solve correlation */ + float bsdf_u, bsdf_v; + path_state_rng_2D(kg, rng, state, PRNG_BSDF_U, &bsdf_u, &bsdf_v); + + float ao_factor = kernel_data.background.ao_factor; + float3 ao_N; + AOBSDF_coop[ray_index] = shader_bsdf_ao(kg, sd, ao_factor, &ao_N); + AOAlpha_coop[ray_index] = shader_bsdf_alpha(kg, sd); + + float3 ao_D; + float ao_pdf; + sample_cos_hemisphere(ao_N, bsdf_u, bsdf_v, &ao_D, &ao_pdf); + + if(dot(ccl_fetch(sd, Ng), ao_D) > 0.0f && ao_pdf != 0.0f) { + Ray _ray; + _ray.P = ray_offset(ccl_fetch(sd, P), ccl_fetch(sd, Ng)); + _ray.D = ao_D; + _ray.t = kernel_data.background.ao_distance; +#ifdef __OBJECT_MOTION__ + _ray.time = ccl_fetch(sd, time); +#endif + _ray.dP = ccl_fetch(sd, dP); + _ray.dD = differential3_zero(); + AOLightRay_coop[ray_index] = _ray; + +
ADD_RAY_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO); + *enqueue_flag_AO_SHADOW_RAY_CAST = 1; + } + } + } +#endif /* __AO__ */ +} diff --git a/intern/cycles/kernel/split/kernel_lamp_emission.h b/intern/cycles/kernel/split/kernel_lamp_emission.h new file mode 100644 index 00000000000..a8e4b0a06c8 --- /dev/null +++ b/intern/cycles/kernel/split/kernel_lamp_emission.h @@ -0,0 +1,179 @@ +/* + * Copyright 2011-2015 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "kernel_split_common.h" + +/* Note on kernel_lamp_emission + * This is the 3rd kernel in the ray-tracing logic. This is the second of the + * path-iteration kernels. This kernel takes care of the indirect lamp emission logic. + * This kernel operates on QUEUE_ACTIVE_AND_REGENERATED_RAYS. It processes rays of state RAY_ACTIVE + * and RAY_HIT_BACKGROUND. + * We will empty QUEUE_ACTIVE_AND_REGENERATED_RAYS queue in this kernel.
+ * The input/output of the kernel is as follows, + * Throughput_coop ------------------------------------|--- kernel_lamp_emission --|--- PathRadiance_coop + * Ray_coop -------------------------------------------| |--- Queue_data(QUEUE_ACTIVE_AND_REGENERATED_RAYS) + * PathState_coop -------------------------------------| |--- Queue_index(QUEUE_ACTIVE_AND_REGENERATED_RAYS) + * kg (globals + data) --------------------------------| | + * Intersection_coop ----------------------------------| | + * ray_state ------------------------------------------| | + * Queue_data (QUEUE_ACTIVE_AND_REGENERATED_RAYS) -----| | + * Queue_index (QUEUE_ACTIVE_AND_REGENERATED_RAYS) ----| | + * queuesize ------------------------------------------| | + * use_queues_flag ------------------------------------| | + * sw -------------------------------------------------| | + * sh -------------------------------------------------| | + * parallel_samples -----------------------------------| | + * + * Note : shader_data is neither input nor output. It is just filled and consumed within the same kernel (kernel_lamp_emission).
+ */ +ccl_device void kernel_lamp_emission( + ccl_global char *globals, + ccl_constant KernelData *data, + ccl_global char *shader_data, /* Required for lamp emission */ + ccl_global float3 *throughput_coop, /* Required for lamp emission */ + PathRadiance *PathRadiance_coop, /* Required for lamp emission */ + ccl_global Ray *Ray_coop, /* Required for lamp emission */ + ccl_global PathState *PathState_coop, /* Required for lamp emission */ + Intersection *Intersection_coop, /* Required for lamp emission */ + ccl_global char *ray_state, /* Denotes the state of each ray */ + int sw, int sh, + ccl_global char *use_queues_flag, /* Used to decide if this kernel should use + * queues to fetch ray index + */ + int parallel_samples, /* Number of samples to be processed in parallel */ + int ray_index) +{ + if(IS_STATE(ray_state, ray_index, RAY_ACTIVE) || + IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND)) + { /* Rays in any other state are left untouched by this function */ + KernelGlobals *kg = (KernelGlobals *)globals; + ShaderData *sd = (ShaderData *)shader_data; + PathRadiance *L = &PathRadiance_coop[ray_index]; + + float3 throughput = throughput_coop[ray_index]; + Ray ray = Ray_coop[ray_index]; + PathState state = PathState_coop[ray_index]; /* NOTE(review): by-value copy; the state.ray_t update below is never stored back to PathState_coop in this function -- confirm that dropping it is intended */ + +#ifdef __LAMP_MIS__ + if(kernel_data.integrator.use_lamp_mis && !(state.flag & PATH_RAY_CAMERA)) { + /* ray starting from previous non-transparent bounce */ + Ray light_ray; + + light_ray.P = ray.P - state.ray_t*ray.D; + state.ray_t += Intersection_coop[ray_index].t; + light_ray.D = ray.D; + light_ray.t = state.ray_t; + light_ray.time = ray.time; + light_ray.dD = ray.dD; + light_ray.dP = ray.dP; + /* intersect with lamp */ + float3 emission; + + if(indirect_lamp_emission(kg, &state, &light_ray, &emission, sd)) { + path_radiance_accum_emission(L, throughput, emission, state.bounce); + } + } +#endif /* __LAMP_MIS__ */ + + /* __VOLUME__ feature is disabled */ +#if 0 +#ifdef __VOLUME__ + /* volume attenuation, emission, scatter */ + if(state.volume_stack[0].shader != SHADER_NONE) { + Ray
volume_ray = ray; + volume_ray.t = (hit)? isect.t: FLT_MAX; + + bool heterogeneous = volume_stack_is_heterogeneous(kg, state.volume_stack); + +#ifdef __VOLUME_DECOUPLED__ + int sampling_method = volume_stack_sampling_method(kg, state.volume_stack); + bool decoupled = kernel_volume_use_decoupled(kg, heterogeneous, true, sampling_method); + + if(decoupled) { + /* cache steps along volume for repeated sampling */ + VolumeSegment volume_segment; + ShaderData volume_sd; + + shader_setup_from_volume(kg, &volume_sd, &volume_ray, state.bounce, state.transparent_bounce); + kernel_volume_decoupled_record(kg, &state, + &volume_ray, &volume_sd, &volume_segment, heterogeneous); + + volume_segment.sampling_method = sampling_method; + + /* emission */ + if(volume_segment.closure_flag & SD_EMISSION) + path_radiance_accum_emission(&L, throughput, volume_segment.accum_emission, state.bounce); + + /* scattering */ + VolumeIntegrateResult result = VOLUME_PATH_ATTENUATED; + + if(volume_segment.closure_flag & SD_SCATTER) { + bool all = false; + + /* direct light sampling */ + kernel_branched_path_volume_connect_light(kg, rng, &volume_sd, + throughput, &state, &L, 1.0f, all, &volume_ray, &volume_segment); + + /* indirect sample.
if we use distance sampling and take just + * one sample for direct and indirect light, we could share + * this computation, but makes code a bit complex */ + float rphase = path_state_rng_1D_for_decision(kg, rng, &state, PRNG_PHASE); + float rscatter = path_state_rng_1D_for_decision(kg, rng, &state, PRNG_SCATTER_DISTANCE); + + result = kernel_volume_decoupled_scatter(kg, + &state, &volume_ray, &volume_sd, &throughput, + rphase, rscatter, &volume_segment, NULL, true); + } + + if(result != VOLUME_PATH_SCATTERED) + throughput *= volume_segment.accum_transmittance; + + /* free cached steps */ + kernel_volume_decoupled_free(kg, &volume_segment); + + if(result == VOLUME_PATH_SCATTERED) { + if(kernel_path_volume_bounce(kg, rng, &volume_sd, &throughput, &state, &L, &ray)) + continue; + else + break; + } + } + else +#endif /* __VOLUME_DECOUPLED__ */ + { + /* integrate along volume segment with distance sampling */ + ShaderData volume_sd; + VolumeIntegrateResult result = kernel_volume_integrate( + kg, &state, &volume_sd, &volume_ray, &L, &throughput, rng, heterogeneous); + +#ifdef __VOLUME_SCATTER__ + if(result == VOLUME_PATH_SCATTERED) { + /* direct lighting */ + kernel_path_volume_connect_light(kg, rng, &volume_sd, throughput, &state, &L); + + /* indirect light bounce */ + if(kernel_path_volume_bounce(kg, rng, &volume_sd, &throughput, &state, &L, &ray)) + continue; + else + break; + } +#endif /* __VOLUME_SCATTER__ */ + } + } +#endif /* __VOLUME__ */ +#endif + } +} diff --git a/intern/cycles/kernel/split/kernel_next_iteration_setup.h b/intern/cycles/kernel/split/kernel_next_iteration_setup.h new file mode 100644 index 00000000000..2dbdabc5fd3 --- /dev/null +++ b/intern/cycles/kernel/split/kernel_next_iteration_setup.h @@ -0,0 +1,145 @@ +/* + * Copyright 2011-2015 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "kernel_split_common.h" + +/* Note on kernel_next_iteration_setup kernel. + * This is the tenth kernel in the ray tracing logic. This is the ninth + * of the path iteration kernels. This kernel takes care of setting up + * Ray for the next iteration of path-iteration and accumulating radiance + * corresponding to AO and direct-lighting + * + * The ray state of rays that are terminated in this kernel is changed to RAY_UPDATE_BUFFER + * + * The input and output are as follows, + * + * rng_coop ---------------------------------------------|--- kernel_next_iteration_setup -|--- Queue_index (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS) + * throughput_coop --------------------------------------| |--- Queue_data (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS) + * PathRadiance_coop ------------------------------------| |--- throughput_coop + * PathState_coop ---------------------------------------| |--- PathRadiance_coop + * shader_data ------------------------------------------| |--- PathState_coop + * ray_state --------------------------------------------| |--- ray_state + * Queue_data (QUEUE_ACTIVE_AND_REGENERATED_RAYS) -------| |--- Ray_coop + * Queue_index (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS) ---| |--- use_queues_flag + * Ray_coop ---------------------------------------------| | + * kg (globals + data) ----------------------------------| | + * LightRay_dl_coop -------------------------------------| + * ISLamp_coop ------------------------------------------| + * BSDFEval_coop ----------------------------------------| + * LightRay_ao_coop 
-------------------------------------| + * AOBSDF_coop ------------------------------------------| + * AOAlpha_coop -----------------------------------------| + * + * Note on queues, + * This kernel fetches rays from the queue QUEUE_ACTIVE_AND_REGENERATED_RAYS and processes only + * the rays of state RAY_ACTIVE. + * There are different points in this kernel where a ray may terminate and reach RAY_UPDATE_BUFFER + * state. These rays are enqueued into QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue. These rays will + * still be present in QUEUE_ACTIVE_AND_REGENERATED_RAYS queue, but since their ray-state has been + * changed to RAY_UPDATE_BUFFER, there is no problem. + * + * State of queues when this kernel is called : + * At entry, + * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE, RAY_REGENERATED, RAY_UPDATE_BUFFER rays. + * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_TO_REGENERATE and RAY_UPDATE_BUFFER rays + * At exit, + * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE, RAY_REGENERATED and more RAY_UPDATE_BUFFER rays. 
+ * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_TO_REGENERATE and more RAY_UPDATE_BUFFER rays + */ +ccl_device char kernel_next_iteration_setup( + ccl_global char *globals, + ccl_constant KernelData *data, + ccl_global char *shader_data, /* Required for setting up ray for next iteration */ + ccl_global uint *rng_coop, /* Required for setting up ray for next iteration */ + ccl_global float3 *throughput_coop, /* Required for setting up ray for next iteration */ + PathRadiance *PathRadiance_coop, /* Required for setting up ray for next iteration */ + ccl_global Ray *Ray_coop, /* Required for setting up ray for next iteration */ + ccl_global PathState *PathState_coop, /* Required for setting up ray for next iteration */ + ccl_global Ray *LightRay_dl_coop, /* Required for radiance update - direct lighting */ + ccl_global int *ISLamp_coop, /* Required for radiance update - direct lighting */ + ccl_global BsdfEval *BSDFEval_coop, /* Required for radiance update - direct lighting */ + ccl_global Ray *LightRay_ao_coop, /* Required for radiance update - AO */ + ccl_global float3 *AOBSDF_coop, /* Required for radiance update - AO */ + ccl_global float3 *AOAlpha_coop, /* Required for radiance update - AO */ + ccl_global char *ray_state, /* Denotes the state of each ray */ + ccl_global char *use_queues_flag, /* flag to decide if scene_intersect kernel should + * use queues to fetch ray index */ + int ray_index) +{ + char enqueue_flag = 0; + + /* Load kernel globals structure and ShaderData structure. */ + KernelGlobals *kg = (KernelGlobals *)globals; + ShaderData *sd = (ShaderData *)shader_data; + PathRadiance *L = 0x0; + ccl_global PathState *state = 0x0; + + /* Path radiance update for AO/Direct_lighting's shadow blocked. 
*/ + if(IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL) || + IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO)) + { + state = &PathState_coop[ray_index]; + L = &PathRadiance_coop[ray_index]; + float3 _throughput = throughput_coop[ray_index]; + + if(IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO)) { + float3 shadow = LightRay_ao_coop[ray_index].P; + char update_path_radiance = LightRay_ao_coop[ray_index].t; + if(update_path_radiance) { + path_radiance_accum_ao(L, + _throughput, + AOAlpha_coop[ray_index], + AOBSDF_coop[ray_index], + shadow, + state->bounce); + } + REMOVE_RAY_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO); + } + + if(IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL)) { + float3 shadow = LightRay_dl_coop[ray_index].P; + char update_path_radiance = LightRay_dl_coop[ray_index].t; + if(update_path_radiance) { + BsdfEval L_light = BSDFEval_coop[ray_index]; + path_radiance_accum_light(L, + _throughput, + &L_light, + shadow, + 1.0f, + state->bounce, + ISLamp_coop[ray_index]); + } + REMOVE_RAY_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL); + } + } + + if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) { + + ccl_global float3 *throughput = &throughput_coop[ray_index]; + ccl_global Ray *ray = &Ray_coop[ray_index]; + ccl_global RNG* rng = &rng_coop[ray_index]; + state = &PathState_coop[ray_index]; + L = &PathRadiance_coop[ray_index]; + + /* compute direct lighting and next bounce */ + if(!kernel_path_surface_bounce(kg, rng, sd, throughput, state, L, ray)) { + ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER); + enqueue_flag = 1; + } + } + + return enqueue_flag; +} diff --git a/intern/cycles/kernel/split/kernel_scene_intersect.h b/intern/cycles/kernel/split/kernel_scene_intersect.h new file mode 100644 index 00000000000..73aa005a496 --- /dev/null +++ b/intern/cycles/kernel/split/kernel_scene_intersect.h @@ -0,0 +1,137 @@ +/* + * Copyright 2011-2015 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the 
"License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "kernel_split_common.h" + +/* Note on kernel_scene_intersect kernel. + * This is the second kernel in the ray tracing logic. This is the first + * of the path iteration kernels. This kernel takes care of scene_intersect function. + * + * This kernel changes the ray_state of RAY_REGENERATED rays to RAY_ACTIVE. + * This kernel processes rays of ray state RAY_ACTIVE + * This kernel determines the rays that have hit the background and changes their ray state to RAY_HIT_BACKGROUND. + * + * The input and output are as follows, + * + * Ray_coop ---------------------------------------|--------- kernel_scene_intersect----------|--- PathState + * PathState_coop ---------------------------------| |--- Intersection + * ray_state --------------------------------------| |--- ray_state + * use_queues_flag --------------------------------| | + * parallel_samples -------------------------------| | + * QueueData(QUEUE_ACTIVE_AND_REGENERATED_RAYS) ---| | + * kg (data + globals) ----------------------------| | + * rng_coop ---------------------------------------| | + * sw ---------------------------------------------| | + * sh ---------------------------------------------| | + * queuesize --------------------------------------| | + * + * Note on Queues : + * Ideally we would want kernel_scene_intersect to work on queues. 
+ * But during the very first time, the queues will be empty and hence we perform a direct mapping + * between ray-index and thread-index; From the next time onward, the queue will be filled and + * we may start operating on queues. + * + * State of queue during the first time this kernel is called : + * QUEUE_ACTIVE_AND_REGENERATED_RAYS and QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be empty before and after this kernel. + * + * State of queues during other times this kernel is called : + * At entry, + * QUEUE_ACTIVE_AND_REGENERATED_RAYS will have a mix of RAY_ACTIVE, RAY_UPDATE_BUFFER and RAY_REGENERATED rays; + * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_TO_REGENERATE and RAY_UPDATE_BUFFER rays ; + * (The rays that are in the state RAY_UPDATE_BUFFER in both the queues are actually the same rays; These + * are the rays that were in RAY_ACTIVE state during the initial enqueue but on further processing, + * by different kernels, have turned into RAY_UPDATE_BUFFER rays. 
Since all kernel, even after fetching from + * QUEUE_ACTIVE_AND_REGENERATED_RAYS, proceed further based on ray state information, RAY_UPDATE_BUFFER rays + * being present in QUEUE_ACTIVE_AND_REGENERATED_RAYS does not cause any logical issues) + * At exit, + * QUEUE_ACTIVE_AND_REGENERATED_RAYS - All RAY_REGENERATED rays will have been converted to RAY_ACTIVE and + * Some rays in QUEUE_ACTIVE_AND_REGENERATED_RAYS queue will move to state RAY_HIT_BACKGROUND + * QUEUE_HITBF_BUFF_UPDATE_TOREGEN_RAYS - no change + */ + +ccl_device void kernel_scene_intersect( + ccl_global char *globals, + ccl_constant KernelData *data, + ccl_global uint *rng_coop, + ccl_global Ray *Ray_coop, /* Required for scene_intersect */ + ccl_global PathState *PathState_coop, /* Required for scene_intersect */ + Intersection *Intersection_coop, /* Required for scene_intersect */ + ccl_global char *ray_state, /* Denotes the state of each ray */ + int sw, int sh, + ccl_global char *use_queues_flag, /* used to decide if this kernel should use + * queues to fetch ray index */ +#ifdef __KERNEL_DEBUG__ + DebugData *debugdata_coop, +#endif + int parallel_samples, /* Number of samples to be processed in parallel */ + int ray_index) +{ + /* All regenerated rays become active here */ + if(IS_STATE(ray_state, ray_index, RAY_REGENERATED)) + ASSIGN_RAY_STATE(ray_state, ray_index, RAY_ACTIVE); + + if(!IS_STATE(ray_state, ray_index, RAY_ACTIVE)) + return; + + /* Load kernel globals structure */ + KernelGlobals *kg = (KernelGlobals *)globals; + +#ifdef __KERNEL_DEBUG__ + DebugData *debug_data = &debugdata_coop[ray_index]; +#endif + Intersection *isect = &Intersection_coop[ray_index]; + PathState state = PathState_coop[ray_index]; + Ray ray = Ray_coop[ray_index]; + + /* intersect scene */ + uint visibility = path_state_ray_visibility(kg, &state); + +#ifdef __HAIR__ + float difl = 0.0f, extmax = 0.0f; + uint lcg_state = 0; + RNG rng = rng_coop[ray_index]; + + if(kernel_data.bvh.have_curves) { + 
if((kernel_data.cam.resolution == 1) && (state.flag & PATH_RAY_CAMERA)) { + float3 pixdiff = ray.dD.dx + ray.dD.dy; + /*pixdiff = pixdiff - dot(pixdiff, ray.D)*ray.D;*/ + difl = kernel_data.curve.minimum_width * len(pixdiff) * 0.5f; + } + + extmax = kernel_data.curve.maximum_width; + lcg_state = lcg_state_init(&rng, &state, 0x51633e2d); + } + + bool hit = scene_intersect(kg, &ray, visibility, isect, &lcg_state, difl, extmax); +#else + bool hit = scene_intersect(kg, &ray, visibility, isect, NULL, 0.0f, 0.0f); +#endif + +#ifdef __KERNEL_DEBUG__ + if(state.flag & PATH_RAY_CAMERA) { + debug_data->num_bvh_traversal_steps += isect->num_traversal_steps; + debug_data->num_bvh_traversed_instances += isect->num_traversed_instances; + } + debug_data->num_ray_bounces++; +#endif + + if(!hit) { + /* Change the state of rays that hit the background; + * These rays undergo special processing in the + * background_bufferUpdate kernel*/ + ASSIGN_RAY_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND); + } +} diff --git a/intern/cycles/kernel/split/kernel_shader_eval.h b/intern/cycles/kernel/split/kernel_shader_eval.h new file mode 100644 index 00000000000..e6fdc592586 --- /dev/null +++ b/intern/cycles/kernel/split/kernel_shader_eval.h @@ -0,0 +1,75 @@ +/* + * Copyright 2011-2015 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "kernel_split_common.h" + +/* Note on kernel_shader_eval kernel + * This kernel is the 5th kernel in the ray tracing logic. This is + * the 4th kernel in path iteration. This kernel sets up the ShaderData + * structure from the values computed by the previous kernels. It also identifies + * the rays of state RAY_TO_REGENERATE and enqueues them in QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue. + * + * The input and output of the kernel are as follows, + * rng_coop -------------------------------------------|--- kernel_shader_eval --|--- shader_data + * Ray_coop -------------------------------------------| |--- Queue_data (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS) + * PathState_coop -------------------------------------| |--- Queue_index (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS) + * Intersection_coop ----------------------------------| | + * Queue_data (QUEUE_ACTIVE_AND_REGENERATED_RAYS)------| | + * Queue_index(QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS)---| | + * ray_state ------------------------------------------| | + * kg (globals + data) --------------------------------| | + * queuesize ------------------------------------------| | + * + * Note on Queues : + * This kernel reads from the QUEUE_ACTIVE_AND_REGENERATED_RAYS queue and processes + * only the rays of state RAY_ACTIVE; + * State of queues when this kernel is called, + * at entry, + * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE and RAY_REGENERATED rays + * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be empty. 
+ * at exit, + * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE and RAY_REGENERATED rays + * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_TO_REGENERATE rays + */ +ccl_device void kernel_shader_eval( + ccl_global char *globals, + ccl_constant KernelData *data, + ccl_global char *shader_data, /* Output ShaderData structure to be filled */ + ccl_global uint *rng_coop, /* Required for rbsdf calculation */ + ccl_global Ray *Ray_coop, /* Required for setting up shader from ray */ + ccl_global PathState *PathState_coop, /* Required for all functions in this kernel */ + Intersection *Intersection_coop, /* Required for setting up shader from ray */ + ccl_global char *ray_state, /* Denotes the state of each ray */ + int ray_index) +{ + if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) { + KernelGlobals *kg = (KernelGlobals *)globals; + ShaderData *sd = (ShaderData *)shader_data; + Intersection *isect = &Intersection_coop[ray_index]; + ccl_global uint *rng = &rng_coop[ray_index]; + ccl_global PathState *state = &PathState_coop[ray_index]; + Ray ray = Ray_coop[ray_index]; + + shader_setup_from_ray(kg, + sd, + isect, + &ray, + state->bounce, + state->transparent_bounce); + float rbsdf = path_state_rng_1D_for_decision(kg, rng, state, PRNG_BSDF); + shader_eval_surface(kg, sd, rbsdf, state->flag, SHADER_CONTEXT_MAIN); + } +} diff --git a/intern/cycles/kernel/split/kernel_shadow_blocked.h b/intern/cycles/kernel/split/kernel_shadow_blocked.h new file mode 100644 index 00000000000..154ec53ffbb --- /dev/null +++ b/intern/cycles/kernel/split/kernel_shadow_blocked.h @@ -0,0 +1,99 @@ +/* + * Copyright 2011-2015 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "kernel_split_common.h" + +/* Note on kernel_shadow_blocked kernel. + * This is the ninth kernel in the ray tracing logic. This is the eighth + * of the path iteration kernels. This kernel takes care of "shadow ray cast" + * logic of the direct lighting and AO part of ray tracing. + * + * The input and output are as follows, + * + * PathState_coop ----------------------------------|--- kernel_shadow_blocked --| + * LightRay_dl_coop --------------------------------| |--- LightRay_dl_coop + * LightRay_ao_coop --------------------------------| |--- LightRay_ao_coop + * ray_state ---------------------------------------| |--- ray_state + * Queue_data(QUEUE_SHADOW_RAY_CAST_AO_RAYS & | |--- Queue_data (QUEUE_SHADOW_RAY_CAST_AO_RAYS & QUEUE_SHADOW_RAY_CAST_DL_RAYS) + QUEUE_SHADOW_RAY_CAST_DL_RAYS) -------| | + * Queue_index(QUEUE_SHADOW_RAY_CAST_AO_RAYS& + QUEUE_SHADOW_RAY_CAST_DL_RAYS) -------| | + * kg (globals + data) -----------------------------| | + * queuesize ---------------------------------------| | + * + * Note on shader_shadow : shader_shadow is neither input nor output to this kernel. shader_shadow is filled and consumed in this kernel itself. + * Note on queues : + * The kernel fetches from QUEUE_SHADOW_RAY_CAST_AO_RAYS and QUEUE_SHADOW_RAY_CAST_DL_RAYS queues. We will empty + * these queues in this kernel. + * State of queues when this kernel is called : + * state of queues QUEUE_ACTIVE_AND_REGENERATED_RAYS and QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be same + * before and after this kernel call. 
+ * QUEUE_SHADOW_RAY_CAST_AO_RAYS & QUEUE_SHADOW_RAY_CAST_DL_RAYS will be filled with rays marked with flags RAY_SHADOW_RAY_CAST_AO + * and RAY_SHADOW_RAY_CAST_DL respectively, during kernel entry. + * QUEUE_SHADOW_RAY_CAST_AO_RAYS and QUEUE_SHADOW_RAY_CAST_DL_RAYS will be empty at kernel exit. + */ +ccl_device void kernel_shadow_blocked( + ccl_global char *globals, + ccl_constant KernelData *data, + ccl_global char *shader_shadow, /* Required for shadow blocked */ + ccl_global PathState *PathState_coop, /* Required for shadow blocked */ + ccl_global Ray *LightRay_dl_coop, /* Required for direct lighting's shadow blocked */ + ccl_global Ray *LightRay_ao_coop, /* Required for AO's shadow blocked */ + Intersection *Intersection_coop_AO, + Intersection *Intersection_coop_DL, + ccl_global char *ray_state, + int total_num_rays, + char shadow_blocked_type, + int ray_index) +{ + /* Flag determining if we need to update L. */ + char update_path_radiance = 0; + + if(IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL) || + IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO)) + { + /* Load kernel global structure */ + KernelGlobals *kg = (KernelGlobals *)globals; + ShaderData *sd_shadow = (ShaderData *)shader_shadow; + + ccl_global PathState *state = &PathState_coop[ray_index]; + ccl_global Ray *light_ray_dl_global = &LightRay_dl_coop[ray_index]; + ccl_global Ray *light_ray_ao_global = &LightRay_ao_coop[ray_index]; + Intersection *isect_ao_global = &Intersection_coop_AO[ray_index]; + Intersection *isect_dl_global = &Intersection_coop_DL[ray_index]; + + ccl_global Ray *light_ray_global = + shadow_blocked_type == RAY_SHADOW_RAY_CAST_AO + ? light_ray_ao_global + : light_ray_dl_global; + Intersection *isect_global = + RAY_SHADOW_RAY_CAST_AO ? 
isect_ao_global : isect_dl_global; + + float3 shadow; + update_path_radiance = !(shadow_blocked(kg, + state, + light_ray_global, + &shadow, + sd_shadow, + isect_global)); + + /* We use light_ray_global's P and t to store shadow and + * update_path_radiance. + */ + light_ray_global->P = shadow; + light_ray_global->t = update_path_radiance; + } +} diff --git a/intern/cycles/kernel/split/kernel_split_common.h b/intern/cycles/kernel/split/kernel_split_common.h new file mode 100644 index 00000000000..e1c7e2cea99 --- /dev/null +++ b/intern/cycles/kernel/split/kernel_split_common.h @@ -0,0 +1,62 @@ +/* + * Copyright 2011-2015 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __KERNEL_SPLIT_H__ +#define __KERNEL_SPLIT_H__ + +#include "kernel_compat_opencl.h" +#include "kernel_math.h" +#include "kernel_types.h" +#include "kernel_globals.h" + +#include "util_atomic.h" + +#include "kernel_random.h" +#include "kernel_projection.h" +#include "kernel_montecarlo.h" +#include "kernel_differential.h" +#include "kernel_camera.h" + +#include "geom/geom.h" + +#include "kernel_accumulate.h" +#include "kernel_shader.h" +#include "kernel_light.h" +#include "kernel_passes.h" + +#ifdef __SUBSURFACE__ +#include "kernel_subsurface.h" +#endif + +#ifdef __VOLUME__ +#include "kernel_volume.h" +#endif + +#include "kernel_path_state.h" +#include "kernel_shadow.h" +#include "kernel_emission.h" +#include "kernel_path_common.h" +#include "kernel_path_surface.h" +#include "kernel_path_volume.h" + +#ifdef __KERNEL_DEBUG__ +#include "kernel_debug.h" +#endif + +#include "kernel_queues.h" +#include "kernel_work_stealing.h" + +#endif /* __KERNEL_SPLIT_H__ */ diff --git a/intern/cycles/kernel/split/kernel_sum_all_radiance.h b/intern/cycles/kernel/split/kernel_sum_all_radiance.h new file mode 100644 index 00000000000..a21e9b6a0b1 --- /dev/null +++ b/intern/cycles/kernel/split/kernel_sum_all_radiance.h @@ -0,0 +1,59 @@ +/* + * Copyright 2011-2015 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "../kernel_compat_opencl.h" +#include "../kernel_math.h" +#include "../kernel_types.h" +#include "../kernel_globals.h" + +/* Since we process various samples in parallel; The output radiance of different samples + * are stored in different locations; This kernel combines the output radiance contributed + * by all different samples and stores them in the RenderTile's output buffer. + */ +ccl_device void kernel_sum_all_radiance( + ccl_constant KernelData *data, /* To get pass_stride to offet into buffer */ + ccl_global float *buffer, /* Output buffer of RenderTile */ + ccl_global float *per_sample_output_buffer, /* Radiance contributed by all samples */ + int parallel_samples, int sw, int sh, int stride, + int buffer_offset_x, + int buffer_offset_y, + int buffer_stride, + int start_sample) +{ + int x = get_global_id(0); + int y = get_global_id(1); + + if(x < sw && y < sh) { + buffer += ((buffer_offset_x + x) + (buffer_offset_y + y) * buffer_stride) * (data->film.pass_stride); + per_sample_output_buffer += ((x + y * stride) * parallel_samples) * (data->film.pass_stride); + + int sample_stride = (data->film.pass_stride); + + int sample_iterator = 0; + int pass_stride_iterator = 0; + int num_floats = data->film.pass_stride; + + for(sample_iterator = 0; sample_iterator < parallel_samples; sample_iterator++) { + for(pass_stride_iterator = 0; pass_stride_iterator < num_floats; pass_stride_iterator++) { + *(buffer + pass_stride_iterator) = + (start_sample == 0 && sample_iterator == 0) + ? 
*(per_sample_output_buffer + pass_stride_iterator) + : *(buffer + pass_stride_iterator) + *(per_sample_output_buffer + pass_stride_iterator); + } + per_sample_output_buffer += sample_stride; + } + } +} diff --git a/intern/cycles/kernel/svm/svm.h b/intern/cycles/kernel/svm/svm.h index d59c9b9e61c..15ac6519780 100644 --- a/intern/cycles/kernel/svm/svm.h +++ b/intern/cycles/kernel/svm/svm.h @@ -87,7 +87,7 @@ ccl_device_inline int stack_load_int(float *stack, uint a) return __float_as_int(stack[a]); } -ccl_device_inline float stack_load_int_default(float *stack, uint a, uint value) +ccl_device_inline int stack_load_int_default(float *stack, uint a, uint value) { return (a == (uint)SVM_STACK_INVALID)? (int)value: stack_load_int(stack, a); } @@ -142,6 +142,8 @@ CCL_NAMESPACE_END #include "svm_noise.h" #include "svm_texture.h" +#include "svm_math_util.h" + #include "svm_attribute.h" #include "svm_gradient.h" #include "svm_blackbody.h" @@ -164,7 +166,6 @@ CCL_NAMESPACE_END #include "svm_mapping.h" #include "svm_normal.h" #include "svm_wave.h" -#include "svm_math_util.h" #include "svm_math.h" #include "svm_mix.h" #include "svm_ramp.h" @@ -181,17 +182,20 @@ CCL_NAMESPACE_END CCL_NAMESPACE_BEGIN -/* Main Interpreter Loop */ +#define NODES_GROUP(group) ((group) <= __NODES_MAX_GROUP__) +#define NODES_FEATURE(feature) ((__NODES_FEATURES__ & (feature)) != 0) +/* Main Interpreter Loop */ ccl_device_noinline void svm_eval_nodes(KernelGlobals *kg, ShaderData *sd, ShaderType type, int path_flag) { float stack[SVM_STACK_SIZE]; - int offset = sd->shader & SHADER_MASK; + int offset = ccl_fetch(sd, shader) & SHADER_MASK; while(1) { uint4 node = read_node(kg, &offset); switch(node.x) { +#if NODES_GROUP(NODE_GROUP_LEVEL_0) case NODE_SHADER_JUMP: { if(type == SHADER_TYPE_SURFACE) offset = node.y; else if(type == SHADER_TYPE_VOLUME) offset = node.z; @@ -208,15 +212,6 @@ ccl_device_noinline void svm_eval_nodes(KernelGlobals *kg, ShaderData *sd, Shade case NODE_CLOSURE_BACKGROUND: 
svm_node_closure_background(sd, stack, node); break; - case NODE_CLOSURE_HOLDOUT: - svm_node_closure_holdout(sd, stack, node); - break; - case NODE_CLOSURE_AMBIENT_OCCLUSION: - svm_node_closure_ambient_occlusion(sd, stack, node); - break; - case NODE_CLOSURE_VOLUME: - svm_node_closure_volume(kg, sd, stack, node, path_flag); - break; case NODE_CLOSURE_SET_WEIGHT: svm_node_closure_set_weight(sd, node.y, node.z, node.w); break; @@ -237,13 +232,137 @@ ccl_device_noinline void svm_eval_nodes(KernelGlobals *kg, ShaderData *sd, Shade if(stack_load_float(stack, node.z) == 1.0f) offset += node.y; break; -#ifdef __TEXTURES__ + case NODE_GEOMETRY: + svm_node_geometry(kg, sd, stack, node.y, node.z); + break; + case NODE_CONVERT: + svm_node_convert(sd, stack, node.y, node.z, node.w); + break; + case NODE_TEX_COORD: + svm_node_tex_coord(kg, sd, path_flag, stack, node, &offset); + break; + case NODE_VALUE_F: + svm_node_value_f(kg, sd, stack, node.y, node.z); + break; + case NODE_VALUE_V: + svm_node_value_v(kg, sd, stack, node.y, &offset); + break; + case NODE_ATTR: + svm_node_attr(kg, sd, stack, node); + break; +# if NODES_FEATURE(NODE_FEATURE_BUMP) + case NODE_GEOMETRY_BUMP_DX: + svm_node_geometry_bump_dx(kg, sd, stack, node.y, node.z); + break; + case NODE_GEOMETRY_BUMP_DY: + svm_node_geometry_bump_dy(kg, sd, stack, node.y, node.z); + break; + case NODE_SET_DISPLACEMENT: + svm_node_set_displacement(sd, stack, node.y); + break; +# endif /* NODES_FEATURE(NODE_FEATURE_BUMP) */ +# ifdef __TEXTURES__ case NODE_TEX_IMAGE: svm_node_tex_image(kg, sd, stack, node); break; case NODE_TEX_IMAGE_BOX: svm_node_tex_image_box(kg, sd, stack, node); break; + case NODE_TEX_NOISE: + svm_node_tex_noise(kg, sd, stack, node, &offset); + break; +# endif /* __TEXTURES__ */ +# ifdef __EXTRA_NODES__ +# if NODES_FEATURE(NODE_FEATURE_BUMP) + case NODE_SET_BUMP: + svm_node_set_bump(kg, sd, stack, node); + break; + case NODE_ATTR_BUMP_DX: + svm_node_attr_bump_dx(kg, sd, stack, node); + break; + case 
NODE_ATTR_BUMP_DY: + svm_node_attr_bump_dy(kg, sd, stack, node); + break; + case NODE_TEX_COORD_BUMP_DX: + svm_node_tex_coord_bump_dx(kg, sd, path_flag, stack, node, &offset); + break; + case NODE_TEX_COORD_BUMP_DY: + svm_node_tex_coord_bump_dy(kg, sd, path_flag, stack, node, &offset); + break; + case NODE_CLOSURE_SET_NORMAL: + svm_node_set_normal(kg, sd, stack, node.y, node.z); + break; +# endif /* NODES_FEATURE(NODE_FEATURE_BUMP) */ + case NODE_HSV: + svm_node_hsv(kg, sd, stack, node.y, node.z, node.w, &offset); + break; +# endif /* __EXTRA_NODES__ */ +#endif /* NODES_GROUP(NODE_GROUP_LEVEL_0) */ + +#if NODES_GROUP(NODE_GROUP_LEVEL_1) + case NODE_CLOSURE_HOLDOUT: + svm_node_closure_holdout(sd, stack, node); + break; + case NODE_CLOSURE_AMBIENT_OCCLUSION: + svm_node_closure_ambient_occlusion(sd, stack, node); + break; + case NODE_FRESNEL: + svm_node_fresnel(sd, stack, node.y, node.z, node.w); + break; + case NODE_LAYER_WEIGHT: + svm_node_layer_weight(sd, stack, node); + break; +# if NODES_FEATURE(NODE_FEATURE_VOLUME) + case NODE_CLOSURE_VOLUME: + svm_node_closure_volume(kg, sd, stack, node, path_flag); + break; +# endif /* NODES_FEATURE(NODE_FEATURE_VOLUME) */ +# ifdef __EXTRA_NODES__ + case NODE_MATH: + svm_node_math(kg, sd, stack, node.y, node.z, node.w, &offset); + break; + case NODE_VECTOR_MATH: + svm_node_vector_math(kg, sd, stack, node.y, node.z, node.w, &offset); + break; + case NODE_RGB_RAMP: + svm_node_rgb_ramp(kg, sd, stack, node, &offset); + break; + case NODE_GAMMA: + svm_node_gamma(sd, stack, node.y, node.z, node.w); + break; + case NODE_BRIGHTCONTRAST: + svm_node_brightness(sd, stack, node.y, node.z, node.w); + break; + case NODE_LIGHT_PATH: + svm_node_light_path(sd, stack, node.y, node.z, path_flag); + break; + case NODE_OBJECT_INFO: + svm_node_object_info(kg, sd, stack, node.y, node.z); + break; + case NODE_PARTICLE_INFO: + svm_node_particle_info(kg, sd, stack, node.y, node.z); + break; +# ifdef __HAIR__ +# if NODES_FEATURE(NODE_FEATURE_HAIR) + 
case NODE_HAIR_INFO: + svm_node_hair_info(kg, sd, stack, node.y, node.z); + break; +# endif /* NODES_FEATURE(NODE_FEATURE_HAIR) */ +# endif /* __HAIR__ */ +# endif /* __EXTRA_NODES__ */ +#endif /* NODES_GROUP(NODE_GROUP_LEVEL_1) */ + +#if NODES_GROUP(NODE_GROUP_LEVEL_2) + case NODE_MAPPING: + svm_node_mapping(kg, sd, stack, node.y, node.z, &offset); + break; + case NODE_MIN_MAX: + svm_node_min_max(kg, sd, stack, node.y, node.z, &offset); + break; + case NODE_CAMERA: + svm_node_camera(kg, sd, stack, node.y, node.z, node.w); + break; +# ifdef __TEXTURES__ case NODE_TEX_ENVIRONMENT: svm_node_tex_environment(kg, sd, stack, node); break; @@ -253,9 +372,6 @@ ccl_device_noinline void svm_eval_nodes(KernelGlobals *kg, ShaderData *sd, Shade case NODE_TEX_GRADIENT: svm_node_tex_gradient(sd, stack, node); break; - case NODE_TEX_NOISE: - svm_node_tex_noise(kg, sd, stack, node, &offset); - break; case NODE_TEX_VORONOI: svm_node_tex_voronoi(kg, sd, stack, node, &offset); break; @@ -274,55 +390,34 @@ ccl_device_noinline void svm_eval_nodes(KernelGlobals *kg, ShaderData *sd, Shade case NODE_TEX_BRICK: svm_node_tex_brick(kg, sd, stack, node, &offset); break; -#endif - case NODE_CAMERA: - svm_node_camera(kg, sd, stack, node.y, node.z, node.w); - break; - case NODE_GEOMETRY: - svm_node_geometry(kg, sd, stack, node.y, node.z); - break; -#ifdef __EXTRA_NODES__ - case NODE_GEOMETRY_BUMP_DX: - svm_node_geometry_bump_dx(kg, sd, stack, node.y, node.z); - break; - case NODE_GEOMETRY_BUMP_DY: - svm_node_geometry_bump_dy(kg, sd, stack, node.y, node.z); - break; - case NODE_LIGHT_PATH: - svm_node_light_path(sd, stack, node.y, node.z, path_flag); - break; - case NODE_OBJECT_INFO: - svm_node_object_info(kg, sd, stack, node.y, node.z); - break; - case NODE_PARTICLE_INFO: - svm_node_particle_info(kg, sd, stack, node.y, node.z); +# endif /* __TEXTURES__ */ +# ifdef __EXTRA_NODES__ + case NODE_NORMAL: + svm_node_normal(kg, sd, stack, node.y, node.z, node.w, &offset); break; -#ifdef __HAIR__ - case 
NODE_HAIR_INFO: - svm_node_hair_info(kg, sd, stack, node.y, node.z); + case NODE_LIGHT_FALLOFF: + svm_node_light_falloff(sd, stack, node); break; -#endif +# endif /* __EXTRA_NODES__ */ +#endif /* NODES_GROUP(NODE_GROUP_LEVEL_2) */ -#endif - case NODE_CONVERT: - svm_node_convert(sd, stack, node.y, node.z, node.w); +#if NODES_GROUP(NODE_GROUP_LEVEL_3) + case NODE_RGB_CURVES: + svm_node_rgb_curves(kg, sd, stack, node, &offset); break; - case NODE_VALUE_F: - svm_node_value_f(kg, sd, stack, node.y, node.z); + case NODE_VECTOR_CURVES: + svm_node_vector_curves(kg, sd, stack, node, &offset); break; - case NODE_VALUE_V: - svm_node_value_v(kg, sd, stack, node.y, &offset); + case NODE_TANGENT: + svm_node_tangent(kg, sd, stack, node); break; -#ifdef __EXTRA_NODES__ + case NODE_NORMAL_MAP: + svm_node_normal_map(kg, sd, stack, node); + break; +# ifdef __EXTRA_NODES__ case NODE_INVERT: svm_node_invert(sd, stack, node.y, node.z, node.w); break; - case NODE_GAMMA: - svm_node_gamma(sd, stack, node.y, node.z, node.w); - break; - case NODE_BRIGHTCONTRAST: - svm_node_brightness(sd, stack, node.y, node.z, node.w); - break; case NODE_MIX: svm_node_mix(kg, sd, stack, node.y, node.z, node.w, &offset); break; @@ -338,28 +433,9 @@ ccl_device_noinline void svm_eval_nodes(KernelGlobals *kg, ShaderData *sd, Shade case NODE_COMBINE_HSV: svm_node_combine_hsv(kg, sd, stack, node.y, node.z, node.w, &offset); break; - case NODE_HSV: - svm_node_hsv(kg, sd, stack, node.y, node.z, node.w, &offset); - break; -#endif - case NODE_ATTR: - svm_node_attr(kg, sd, stack, node); - break; -#ifdef __EXTRA_NODES__ - case NODE_ATTR_BUMP_DX: - svm_node_attr_bump_dx(kg, sd, stack, node); - break; - case NODE_ATTR_BUMP_DY: - svm_node_attr_bump_dy(kg, sd, stack, node); - break; -#endif - case NODE_FRESNEL: - svm_node_fresnel(sd, stack, node.y, node.z, node.w); - break; - case NODE_LAYER_WEIGHT: - svm_node_layer_weight(sd, stack, node); + case NODE_VECTOR_TRANSFORM: + svm_node_vector_transform(kg, sd, stack, node); 
break; -#ifdef __EXTRA_NODES__ case NODE_WIREFRAME: svm_node_wireframe(kg, sd, stack, node); break; @@ -369,70 +445,20 @@ ccl_device_noinline void svm_eval_nodes(KernelGlobals *kg, ShaderData *sd, Shade case NODE_BLACKBODY: svm_node_blackbody(kg, sd, stack, node.y, node.z); break; - case NODE_SET_DISPLACEMENT: - svm_node_set_displacement(sd, stack, node.y); - break; - case NODE_SET_BUMP: - svm_node_set_bump(kg, sd, stack, node); - break; - case NODE_MATH: - svm_node_math(kg, sd, stack, node.y, node.z, node.w, &offset); - break; - case NODE_VECTOR_MATH: - svm_node_vector_math(kg, sd, stack, node.y, node.z, node.w, &offset); - break; - case NODE_VECTOR_TRANSFORM: - svm_node_vector_transform(kg, sd, stack, node); - break; - case NODE_NORMAL: - svm_node_normal(kg, sd, stack, node.y, node.z, node.w, &offset); - break; -#endif - case NODE_MAPPING: - svm_node_mapping(kg, sd, stack, node.y, node.z, &offset); - break; - case NODE_MIN_MAX: - svm_node_min_max(kg, sd, stack, node.y, node.z, &offset); - break; - case NODE_TEX_COORD: - svm_node_tex_coord(kg, sd, path_flag, stack, node, &offset); - break; -#ifdef __EXTRA_NODES__ - case NODE_TEX_COORD_BUMP_DX: - svm_node_tex_coord_bump_dx(kg, sd, path_flag, stack, node, &offset); - break; - case NODE_TEX_COORD_BUMP_DY: - svm_node_tex_coord_bump_dy(kg, sd, path_flag, stack, node, &offset); - break; - case NODE_CLOSURE_SET_NORMAL: - svm_node_set_normal(kg, sd, stack, node.y, node.z ); - break; - case NODE_RGB_RAMP: - svm_node_rgb_ramp(kg, sd, stack, node, &offset); - break; - case NODE_RGB_CURVES: - svm_node_rgb_curves(kg, sd, stack, node, &offset); - break; - case NODE_VECTOR_CURVES: - svm_node_vector_curves(kg, sd, stack, node, &offset); - break; - case NODE_LIGHT_FALLOFF: - svm_node_light_falloff(sd, stack, node); - break; -#endif - case NODE_TANGENT: - svm_node_tangent(kg, sd, stack, node); - break; - case NODE_NORMAL_MAP: - svm_node_normal_map(kg, sd, stack, node); - break; +# endif /* __EXTRA_NODES__ */ +#endif /* 
NODES_GROUP(NODE_GROUP_LEVEL_3) */ case NODE_END: + return; default: + kernel_assert(!"Unknown node type was passed to the SVM machine"); return; } } } +#undef NODES_GROUP +#undef NODES_FEATURE + CCL_NAMESPACE_END #endif /* __SVM_H__ */ diff --git a/intern/cycles/kernel/svm/svm_attribute.h b/intern/cycles/kernel/svm/svm_attribute.h index b63978b6e1f..025ae96f59d 100644 --- a/intern/cycles/kernel/svm/svm_attribute.h +++ b/intern/cycles/kernel/svm/svm_attribute.h @@ -22,12 +22,12 @@ ccl_device void svm_node_attr_init(KernelGlobals *kg, ShaderData *sd, uint4 node, NodeAttributeType *type, NodeAttributeType *mesh_type, AttributeElement *elem, int *offset, uint *out_offset) { - if(sd->object != OBJECT_NONE) { + if(ccl_fetch(sd, object) != OBJECT_NONE) { /* find attribute by unique id */ uint id = node.y; - uint attr_offset = sd->object*kernel_data.bvh.attributes_map_stride; + uint attr_offset = ccl_fetch(sd, object)*kernel_data.bvh.attributes_map_stride; #ifdef __HAIR__ - attr_offset = (sd->type & PRIMITIVE_ALL_CURVE)? attr_offset + ATTR_PRIM_CURVE: attr_offset; + attr_offset = (ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE)? 
attr_offset + ATTR_PRIM_CURVE: attr_offset; #endif uint4 attr_map = kernel_tex_fetch(__attributes_map, attr_offset); diff --git a/intern/cycles/kernel/svm/svm_blackbody.h b/intern/cycles/kernel/svm/svm_blackbody.h index 1e40e868e14..b750ad87b7f 100644 --- a/intern/cycles/kernel/svm/svm_blackbody.h +++ b/intern/cycles/kernel/svm/svm_blackbody.h @@ -36,48 +36,12 @@ CCL_NAMESPACE_BEGIN ccl_device void svm_node_blackbody(KernelGlobals *kg, ShaderData *sd, float *stack, uint temperature_offset, uint col_offset) { - /* Output */ - float3 color_rgb = make_float3(0.0f, 0.0f, 0.0f); - /* Input */ float temperature = stack_load_float(stack, temperature_offset); - if (temperature < BB_DRAPER) { - /* just return very very dim red */ - color_rgb = make_float3(1.0e-6f,0.0f,0.0f); - } - else if (temperature <= BB_MAX_TABLE_RANGE) { - /* This is the overall size of the table */ - const int lookuptablesize = 956; - const float lookuptablenormalize = 1.0f/956.0f; - - /* reconstruct a proper index for the table lookup, compared to OSL we don't look up two colors - just one (the OSL-lerp is also automatically done for us by "lookup_table_read") */ - float t = powf((temperature - BB_DRAPER) * (1.0f / BB_TABLE_SPACING), (1.0f / BB_TABLE_XPOWER)); - - int blackbody_table_offset = kernel_data.tables.blackbody_offset; - - /* Retrieve colors from the lookup table */ - float lutval = t*lookuptablenormalize; - float R = lookup_table_read(kg, lutval, blackbody_table_offset, lookuptablesize); - lutval = (t + 319.0f*1.0f)*lookuptablenormalize; - float G = lookup_table_read(kg, lutval, blackbody_table_offset, lookuptablesize); - lutval = (t + 319.0f*2.0f)*lookuptablenormalize; - float B = lookup_table_read(kg, lutval, blackbody_table_offset, lookuptablesize); - - R = powf(R, BB_TABLE_YPOWER); - G = powf(G, BB_TABLE_YPOWER); - B = powf(B, BB_TABLE_YPOWER); - - color_rgb = make_float3(R, G, B); - } - - /* Luminance */ - float l = linear_rgb_to_gray(color_rgb); - if (l != 0.0f) - color_rgb /= l; + 
float3 color_rgb = svm_math_blackbody_color(temperature); - if (stack_valid(col_offset)) + if(stack_valid(col_offset)) stack_store_float3(stack, col_offset, color_rgb); } diff --git a/intern/cycles/kernel/svm/svm_brick.h b/intern/cycles/kernel/svm/svm_brick.h index 33a2a5c7598..fcf8f47b77e 100644 --- a/intern/cycles/kernel/svm/svm_brick.h +++ b/intern/cycles/kernel/svm/svm_brick.h @@ -47,7 +47,7 @@ ccl_device_noinline float2 svm_brick(float3 p, float mortar_size, float bias, y = p.y - row_height*rownum; return make_float2( - clamp((brick_noise((rownum << 16) + (bricknum & 0xFFFF)) + bias), 0.0f, 1.0f), + saturate((brick_noise((rownum << 16) + (bricknum & 0xFFFF)) + bias)), (x < mortar_size || y < mortar_size || x > (brick_width - mortar_size) || diff --git a/intern/cycles/kernel/svm/svm_brightness.h b/intern/cycles/kernel/svm/svm_brightness.h index 631bd1825ee..e4d545a00ae 100644 --- a/intern/cycles/kernel/svm/svm_brightness.h +++ b/intern/cycles/kernel/svm/svm_brightness.h @@ -32,7 +32,7 @@ ccl_device void svm_node_brightness(ShaderData *sd, float *stack, uint in_color, color.y = max(a*color.y + b, 0.0f); color.z = max(a*color.z + b, 0.0f); - if (stack_valid(out_color)) + if(stack_valid(out_color)) stack_store_float3(stack, out_color, color); } diff --git a/intern/cycles/kernel/svm/svm_camera.h b/intern/cycles/kernel/svm/svm_camera.h index e03745cb331..00678a49d70 100644 --- a/intern/cycles/kernel/svm/svm_camera.h +++ b/intern/cycles/kernel/svm/svm_camera.h @@ -23,17 +23,17 @@ ccl_device void svm_node_camera(KernelGlobals *kg, ShaderData *sd, float *stack, float3 vector; Transform tfm = kernel_data.cam.worldtocamera; - vector = transform_point(&tfm, sd->P); + vector = transform_point(&tfm, ccl_fetch(sd, P)); zdepth = vector.z; distance = len(vector); - if (stack_valid(out_vector)) + if(stack_valid(out_vector)) stack_store_float3(stack, out_vector, normalize(vector)); - if (stack_valid(out_zdepth)) + if(stack_valid(out_zdepth)) stack_store_float(stack, out_zdepth, 
zdepth); - if (stack_valid(out_distance)) + if(stack_valid(out_distance)) stack_store_float(stack, out_distance, distance); } diff --git a/intern/cycles/kernel/svm/svm_closure.h b/intern/cycles/kernel/svm/svm_closure.h index 07ac7104e68..20a6cb8cd45 100644 --- a/intern/cycles/kernel/svm/svm_closure.h +++ b/intern/cycles/kernel/svm/svm_closure.h @@ -25,10 +25,13 @@ ccl_device void svm_node_glass_setup(ShaderData *sd, ShaderClosure *sc, int type sc->data0 = eta; sc->data1 = 0.0f; sc->data2 = 0.0f; - sd->flag |= bsdf_refraction_setup(sc); + ccl_fetch(sd, flag) |= bsdf_refraction_setup(sc); + } + else { + sc->data0 = 0.0f; + sc->data1 = 0.0f; + ccl_fetch(sd, flag) |= bsdf_reflection_setup(sc); } - else - sd->flag |= bsdf_reflection_setup(sc); } else if(type == CLOSURE_BSDF_MICROFACET_BECKMANN_GLASS_ID) { sc->data0 = roughness; @@ -36,9 +39,9 @@ ccl_device void svm_node_glass_setup(ShaderData *sd, ShaderClosure *sc, int type sc->data2 = eta; if(refract) - sd->flag |= bsdf_microfacet_beckmann_refraction_setup(sc); + ccl_fetch(sd, flag) |= bsdf_microfacet_beckmann_refraction_setup(sc); else - sd->flag |= bsdf_microfacet_beckmann_setup(sc); + ccl_fetch(sd, flag) |= bsdf_microfacet_beckmann_setup(sc); } else { sc->data0 = roughness; @@ -46,23 +49,23 @@ ccl_device void svm_node_glass_setup(ShaderData *sd, ShaderClosure *sc, int type sc->data2 = eta; if(refract) - sd->flag |= bsdf_microfacet_ggx_refraction_setup(sc); + ccl_fetch(sd, flag) |= bsdf_microfacet_ggx_refraction_setup(sc); else - sd->flag |= bsdf_microfacet_ggx_setup(sc); + ccl_fetch(sd, flag) |= bsdf_microfacet_ggx_setup(sc); } } ccl_device_inline ShaderClosure *svm_node_closure_get_non_bsdf(ShaderData *sd, ClosureType type, float mix_weight) { - ShaderClosure *sc = &sd->closure[sd->num_closure]; + ShaderClosure *sc = ccl_fetch_array(sd, closure, ccl_fetch(sd, num_closure)); - if(sd->num_closure < MAX_CLOSURE) { + if(ccl_fetch(sd, num_closure) < MAX_CLOSURE) { sc->weight *= mix_weight; sc->type = type; #ifdef 
__OSL__ sc->prim = NULL; #endif - sd->num_closure++; + ccl_fetch(sd, num_closure)++; return sc; } @@ -71,14 +74,15 @@ ccl_device_inline ShaderClosure *svm_node_closure_get_non_bsdf(ShaderData *sd, C ccl_device_inline ShaderClosure *svm_node_closure_get_bsdf(ShaderData *sd, float mix_weight) { - ShaderClosure *sc = &sd->closure[sd->num_closure]; + ShaderClosure *sc = ccl_fetch_array(sd, closure, ccl_fetch(sd, num_closure)); + float3 weight = sc->weight * mix_weight; float sample_weight = fabsf(average(weight)); - if(sample_weight > CLOSURE_WEIGHT_CUTOFF && sd->num_closure < MAX_CLOSURE) { + if(sample_weight > CLOSURE_WEIGHT_CUTOFF && ccl_fetch(sd, num_closure) < MAX_CLOSURE) { sc->weight = weight; sc->sample_weight = sample_weight; - sd->num_closure++; + ccl_fetch(sd, num_closure)++; #ifdef __OSL__ sc->prim = NULL; #endif @@ -90,14 +94,15 @@ ccl_device_inline ShaderClosure *svm_node_closure_get_bsdf(ShaderData *sd, float ccl_device_inline ShaderClosure *svm_node_closure_get_absorption(ShaderData *sd, float mix_weight) { - ShaderClosure *sc = &sd->closure[sd->num_closure]; + ShaderClosure *sc = ccl_fetch_array(sd, closure, ccl_fetch(sd, num_closure)); + float3 weight = (make_float3(1.0f, 1.0f, 1.0f) - sc->weight) * mix_weight; float sample_weight = fabsf(average(weight)); - if(sample_weight > CLOSURE_WEIGHT_CUTOFF && sd->num_closure < MAX_CLOSURE) { + if(sample_weight > CLOSURE_WEIGHT_CUTOFF && ccl_fetch(sd, num_closure) < MAX_CLOSURE) { sc->weight = weight; sc->sample_weight = sample_weight; - sd->num_closure++; + ccl_fetch(sd, num_closure)++; #ifdef __OSL__ sc->prim = NULL; #endif @@ -121,7 +126,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * if(mix_weight == 0.0f) return; - float3 N = stack_valid(data_node.x)? stack_load_float3(stack, data_node.x): sd->N; + float3 N = stack_valid(data_node.x)? stack_load_float3(stack, data_node.x): ccl_fetch(sd, N); float param1 = (stack_valid(param1_offset))? 
stack_load_float(stack, param1_offset): __uint_as_float(node.z); float param2 = (stack_valid(param2_offset))? stack_load_float(stack, param2_offset): __uint_as_float(node.w); @@ -139,13 +144,13 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * sc->data0 = 0.0f; sc->data1 = 0.0f; sc->data2 = 0.0f; - sd->flag |= bsdf_diffuse_setup(sc); + ccl_fetch(sd, flag) |= bsdf_diffuse_setup(sc); } else { sc->data0 = roughness; sc->data1 = 0.0f; sc->data2 = 0.0f; - sd->flag |= bsdf_oren_nayar_setup(sc); + ccl_fetch(sd, flag) |= bsdf_oren_nayar_setup(sc); } } break; @@ -158,7 +163,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * sc->data1 = 0.0f; sc->data2 = 0.0f; sc->N = N; - sd->flag |= bsdf_translucent_setup(sc); + ccl_fetch(sd, flag) |= bsdf_translucent_setup(sc); } break; } @@ -170,7 +175,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * sc->data1 = 0.0f; sc->data2 = 0.0f; sc->N = N; - sd->flag |= bsdf_transparent_setup(sc); + ccl_fetch(sd, flag) |= bsdf_transparent_setup(sc); } break; } @@ -192,13 +197,13 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * /* setup bsdf */ if(type == CLOSURE_BSDF_REFLECTION_ID) - sd->flag |= bsdf_reflection_setup(sc); + ccl_fetch(sd, flag) |= bsdf_reflection_setup(sc); else if(type == CLOSURE_BSDF_MICROFACET_BECKMANN_ID) - sd->flag |= bsdf_microfacet_beckmann_setup(sc); + ccl_fetch(sd, flag) |= bsdf_microfacet_beckmann_setup(sc); else if(type == CLOSURE_BSDF_MICROFACET_GGX_ID) - sd->flag |= bsdf_microfacet_ggx_setup(sc); + ccl_fetch(sd, flag) |= bsdf_microfacet_ggx_setup(sc); else - sd->flag |= bsdf_ashikhmin_shirley_setup(sc); + ccl_fetch(sd, flag) |= bsdf_ashikhmin_shirley_setup(sc); } break; @@ -216,7 +221,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * sc->N = N; float eta = fmaxf(param2, 1e-5f); - eta = (sd->flag & SD_BACKFACING)? 
1.0f/eta: eta; + eta = (ccl_fetch(sd, flag) & SD_BACKFACING)? 1.0f/eta: eta; /* setup bsdf */ if(type == CLOSURE_BSDF_REFRACTION_ID) { @@ -224,7 +229,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * sc->data1 = 0.0f; sc->data2 = 0.0f; - sd->flag |= bsdf_refraction_setup(sc); + ccl_fetch(sd, flag) |= bsdf_refraction_setup(sc); } else { sc->data0 = param1; @@ -232,9 +237,9 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * sc->data2 = eta; if(type == CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID) - sd->flag |= bsdf_microfacet_beckmann_refraction_setup(sc); + ccl_fetch(sd, flag) |= bsdf_microfacet_beckmann_refraction_setup(sc); else - sd->flag |= bsdf_microfacet_ggx_refraction_setup(sc); + ccl_fetch(sd, flag) |= bsdf_microfacet_ggx_refraction_setup(sc); } } @@ -251,15 +256,15 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * #endif /* index of refraction */ float eta = fmaxf(param2, 1e-5f); - eta = (sd->flag & SD_BACKFACING)? 1.0f/eta: eta; + eta = (ccl_fetch(sd, flag) & SD_BACKFACING)? 
1.0f/eta: eta; /* fresnel */ - float cosNO = dot(N, sd->I); + float cosNO = dot(N, ccl_fetch(sd, I)); float fresnel = fresnel_dielectric_cos(cosNO, eta); float roughness = param1; /* reflection */ - ShaderClosure *sc = &sd->closure[sd->num_closure]; + ShaderClosure *sc = ccl_fetch_array(sd, closure, ccl_fetch(sd, num_closure)); float3 weight = sc->weight; float sample_weight = sc->sample_weight; @@ -280,15 +285,17 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * #endif /* refraction */ - sc = &sd->closure[sd->num_closure]; - sc->weight = weight; - sc->sample_weight = sample_weight; + if(ccl_fetch(sd, num_closure) < MAX_CLOSURE) { + sc = ccl_fetch_array(sd, closure, ccl_fetch(sd, num_closure)); + sc->weight = weight; + sc->sample_weight = sample_weight; - sc = svm_node_closure_get_bsdf(sd, mix_weight*(1.0f - fresnel)); + sc = svm_node_closure_get_bsdf(sd, mix_weight*(1.0f - fresnel)); - if(sc) { - sc->N = N; - svm_node_glass_setup(sd, sc, type, eta, roughness, true); + if(sc) { + sc->N = N; + svm_node_glass_setup(sd, sc, type, eta, roughness, true); + } } break; @@ -328,12 +335,12 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * sc->data2 = 0.0f; - if (type == CLOSURE_BSDF_MICROFACET_BECKMANN_ANISO_ID) - sd->flag |= bsdf_microfacet_beckmann_aniso_setup(sc); - else if (type == CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID) - sd->flag |= bsdf_microfacet_ggx_aniso_setup(sc); + if(type == CLOSURE_BSDF_MICROFACET_BECKMANN_ANISO_ID) + ccl_fetch(sd, flag) |= bsdf_microfacet_beckmann_aniso_setup(sc); + else if(type == CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID) + ccl_fetch(sd, flag) |= bsdf_microfacet_ggx_aniso_setup(sc); else - sd->flag |= bsdf_ashikhmin_shirley_aniso_setup(sc); + ccl_fetch(sd, flag) |= bsdf_ashikhmin_shirley_aniso_setup(sc); } break; } @@ -344,10 +351,10 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * sc->N = N; /* sigma */ - sc->data0 = clamp(param1, 0.0f, 1.0f); + 
sc->data0 = saturate(param1); sc->data1 = 0.0f; sc->data2 = 0.0f; - sd->flag |= bsdf_ashikhmin_velvet_setup(sc); + ccl_fetch(sd, flag) |= bsdf_ashikhmin_velvet_setup(sc); } break; } @@ -362,10 +369,10 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * sc->data1 = param2; sc->data2 = 0.0f; - if (type == CLOSURE_BSDF_DIFFUSE_TOON_ID) - sd->flag |= bsdf_diffuse_toon_setup(sc); + if(type == CLOSURE_BSDF_DIFFUSE_TOON_ID) + ccl_fetch(sd, flag) |= bsdf_diffuse_toon_setup(sc); else - sd->flag |= bsdf_glossy_toon_setup(sc); + ccl_fetch(sd, flag) |= bsdf_glossy_toon_setup(sc); } break; } @@ -373,7 +380,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * case CLOSURE_BSDF_HAIR_REFLECTION_ID: case CLOSURE_BSDF_HAIR_TRANSMISSION_ID: { - if(sd->flag & SD_BACKFACING && sd->type & PRIMITIVE_ALL_CURVE) { + if(ccl_fetch(sd, flag) & SD_BACKFACING && ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE) { ShaderClosure *sc = svm_node_closure_get_bsdf(sd, mix_weight); if(sc) { @@ -384,11 +391,13 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * * spawned by transmission from the front */ sc->weight = make_float3(1.0f, 1.0f, 1.0f); sc->N = N; - sd->flag |= bsdf_transparent_setup(sc); + sc->data0 = 0.0f; + sc->data1 = 0.0f; + ccl_fetch(sd, flag) |= bsdf_transparent_setup(sc); } } else { - ShaderClosure *sc = &sd->closure[sd->num_closure]; + ShaderClosure *sc = ccl_fetch_array(sd, closure, ccl_fetch(sd, num_closure)); sc = svm_node_closure_get_bsdf(sd, mix_weight); if(sc) { @@ -397,18 +406,18 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * sc->data1 = param2; sc->data2 = -stack_load_float(stack, data_node.z); - if(!(sd->type & PRIMITIVE_ALL_CURVE)) { - sc->T = normalize(sd->dPdv); + if(!(ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE)) { + sc->T = normalize(ccl_fetch(sd, dPdv)); sc->data2 = 0.0f; } else - sc->T = normalize(sd->dPdu); + sc->T = normalize(ccl_fetch(sd, 
dPdu)); if(type == CLOSURE_BSDF_HAIR_REFLECTION_ID) { - sd->flag |= bsdf_hair_reflection_setup(sc); + ccl_fetch(sd, flag) |= bsdf_hair_reflection_setup(sc); } else { - sd->flag |= bsdf_hair_transmission_setup(sc); + ccl_fetch(sd, flag) |= bsdf_hair_transmission_setup(sc); } } } @@ -418,9 +427,14 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * #endif #ifdef __SUBSURFACE__ +#ifndef __SPLIT_KERNEL__ +# define sc_next(sc) sc++ +# else +# define sc_next(sc) sc = ccl_fetch_array(sd, closure, ccl_fetch(sd, num_closure)) +# endif case CLOSURE_BSSRDF_CUBIC_ID: case CLOSURE_BSSRDF_GAUSSIAN_ID: { - ShaderClosure *sc = &sd->closure[sd->num_closure]; + ShaderClosure *sc = ccl_fetch_array(sd, closure, ccl_fetch(sd, num_closure)); float3 weight = sc->weight * mix_weight; float sample_weight = fabsf(average(weight)); @@ -430,7 +444,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * if(path_flag & PATH_RAY_DIFFUSE_ANCESTOR) param1 = 0.0f; - if(sample_weight > CLOSURE_WEIGHT_CUTOFF && sd->num_closure+2 < MAX_CLOSURE) { + if(sample_weight > CLOSURE_WEIGHT_CUTOFF && ccl_fetch(sd, num_closure)+2 < MAX_CLOSURE) { /* radius * scale */ float3 radius = stack_load_float3(stack, data_node.z)*param1; /* sharpness */ @@ -450,10 +464,10 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * sc->prim = NULL; #endif sc->N = N; - sd->flag |= bssrdf_setup(sc, (ClosureType)type); + ccl_fetch(sd, flag) |= bssrdf_setup(sc, (ClosureType)type); - sd->num_closure++; - sc++; + ccl_fetch(sd, num_closure)++; + sc_next(sc); } if(fabsf(weight.y) > 0.0f) { @@ -467,10 +481,10 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * sc->prim = NULL; #endif sc->N = N; - sd->flag |= bssrdf_setup(sc, (ClosureType)type); + ccl_fetch(sd, flag) |= bssrdf_setup(sc, (ClosureType)type); - sd->num_closure++; - sc++; + ccl_fetch(sd, num_closure)++; + sc_next(sc); } if(fabsf(weight.z) > 0.0f) { @@ 
-484,15 +498,16 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * sc->prim = NULL; #endif sc->N = N; - sd->flag |= bssrdf_setup(sc, (ClosureType)type); + ccl_fetch(sd, flag) |= bssrdf_setup(sc, (ClosureType)type); - sd->num_closure++; - sc++; + ccl_fetch(sd, num_closure)++; + sc_next(sc); } } break; } +# undef sc_next #endif default: break; @@ -520,7 +535,7 @@ ccl_device void svm_node_closure_volume(KernelGlobals *kg, ShaderData *sd, float ShaderClosure *sc = svm_node_closure_get_absorption(sd, mix_weight * density); if(sc) { - sd->flag |= volume_absorption_setup(sc); + ccl_fetch(sd, flag) |= volume_absorption_setup(sc); } break; } @@ -529,7 +544,8 @@ ccl_device void svm_node_closure_volume(KernelGlobals *kg, ShaderData *sd, float if(sc) { sc->data0 = param2; /* g */ - sd->flag |= volume_henyey_greenstein_setup(sc); + sc->data1 = 0.0f; + ccl_fetch(sd, flag) |= volume_henyey_greenstein_setup(sc); } break; } @@ -554,7 +570,7 @@ ccl_device void svm_node_closure_emission(ShaderData *sd, float *stack, uint4 no else svm_node_closure_get_non_bsdf(sd, CLOSURE_EMISSION_ID, 1.0f); - sd->flag |= SD_EMISSION; + ccl_fetch(sd, flag) |= SD_EMISSION; } ccl_device void svm_node_closure_background(ShaderData *sd, float *stack, uint4 node) @@ -588,7 +604,7 @@ ccl_device void svm_node_closure_holdout(ShaderData *sd, float *stack, uint4 nod else svm_node_closure_get_non_bsdf(sd, CLOSURE_HOLDOUT_ID, 1.0f); - sd->flag |= SD_HOLDOUT; + ccl_fetch(sd, flag) |= SD_HOLDOUT; } ccl_device void svm_node_closure_ambient_occlusion(ShaderData *sd, float *stack, uint4 node) @@ -606,15 +622,17 @@ ccl_device void svm_node_closure_ambient_occlusion(ShaderData *sd, float *stack, else svm_node_closure_get_non_bsdf(sd, CLOSURE_AMBIENT_OCCLUSION_ID, 1.0f); - sd->flag |= SD_AO; + ccl_fetch(sd, flag) |= SD_AO; } /* Closure Nodes */ ccl_device_inline void svm_node_closure_store_weight(ShaderData *sd, float3 weight) { - if(sd->num_closure < MAX_CLOSURE) - 
sd->closure[sd->num_closure].weight = weight; + if(ccl_fetch(sd, num_closure) < MAX_CLOSURE) { + ShaderClosure *sc = ccl_fetch_array(sd, closure, ccl_fetch(sd, num_closure)); + sc->weight = weight; + } } ccl_device void svm_node_closure_set_weight(ShaderData *sd, uint r, uint g, uint b) @@ -649,7 +667,7 @@ ccl_device void svm_node_mix_closure(ShaderData *sd, float *stack, uint4 node) decode_node_uchar4(node.y, &weight_offset, &in_weight_offset, &weight1_offset, &weight2_offset); float weight = stack_load_float(stack, weight_offset); - weight = clamp(weight, 0.0f, 1.0f); + weight = saturate(weight); float in_weight = (stack_valid(in_weight_offset))? stack_load_float(stack, in_weight_offset): 1.0f; @@ -664,7 +682,7 @@ ccl_device void svm_node_mix_closure(ShaderData *sd, float *stack, uint4 node) ccl_device void svm_node_set_normal(KernelGlobals *kg, ShaderData *sd, float *stack, uint in_direction, uint out_normal) { float3 normal = stack_load_float3(stack, in_direction); - sd->N = normal; + ccl_fetch(sd, N) = normal; stack_store_float3(stack, out_normal, normal); } diff --git a/intern/cycles/kernel/svm/svm_displace.h b/intern/cycles/kernel/svm/svm_displace.h index 4a058905a93..8d4b07c9973 100644 --- a/intern/cycles/kernel/svm/svm_displace.h +++ b/intern/cycles/kernel/svm/svm_displace.h @@ -25,11 +25,11 @@ ccl_device void svm_node_set_bump(KernelGlobals *kg, ShaderData *sd, float *stac uint normal_offset, distance_offset, invert; decode_node_uchar4(node.y, &normal_offset, &distance_offset, &invert, NULL); - float3 normal_in = stack_valid(normal_offset)? stack_load_float3(stack, normal_offset): sd->N; + float3 normal_in = stack_valid(normal_offset)? 
stack_load_float3(stack, normal_offset): ccl_fetch(sd, N); /* get surface tangents from normal */ - float3 Rx = cross(sd->dP.dy, normal_in); - float3 Ry = cross(normal_in, sd->dP.dx); + float3 Rx = cross(ccl_fetch(sd, dP).dy, normal_in); + float3 Ry = cross(normal_in, ccl_fetch(sd, dP).dx); /* get bump values */ uint c_offset, x_offset, y_offset, strength_offset; @@ -40,7 +40,7 @@ ccl_device void svm_node_set_bump(KernelGlobals *kg, ShaderData *sd, float *stac float h_y = stack_load_float(stack, y_offset); /* compute surface gradient and determinant */ - float det = dot(sd->dP.dx, Rx); + float det = dot(ccl_fetch(sd, dP).dx, Rx); float3 surfgrad = (h_x - h_c)*Rx + (h_y - h_c)*Ry; float absdet = fabsf(det); @@ -65,7 +65,7 @@ ccl_device void svm_node_set_bump(KernelGlobals *kg, ShaderData *sd, float *stac ccl_device void svm_node_set_displacement(ShaderData *sd, float *stack, uint fac_offset) { float d = stack_load_float(stack, fac_offset); - sd->P += sd->N*d*0.1f; /* todo: get rid of this factor */ + ccl_fetch(sd, P) += ccl_fetch(sd, N)*d*0.1f; /* todo: get rid of this factor */ } CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/svm/svm_fresnel.h b/intern/cycles/kernel/svm/svm_fresnel.h index 3703ec55015..23c97d80cb0 100644 --- a/intern/cycles/kernel/svm/svm_fresnel.h +++ b/intern/cycles/kernel/svm/svm_fresnel.h @@ -23,12 +23,12 @@ ccl_device void svm_node_fresnel(ShaderData *sd, float *stack, uint ior_offset, uint normal_offset, out_offset; decode_node_uchar4(node, &normal_offset, &out_offset, NULL, NULL); float eta = (stack_valid(ior_offset))? stack_load_float(stack, ior_offset): __uint_as_float(ior_value); - float3 normal_in = stack_valid(normal_offset)? stack_load_float3(stack, normal_offset): sd->N; + float3 normal_in = stack_valid(normal_offset)? stack_load_float3(stack, normal_offset): ccl_fetch(sd, N); eta = fmaxf(eta, 1e-5f); - eta = (sd->flag & SD_BACKFACING)? 1.0f/eta: eta; + eta = (ccl_fetch(sd, flag) & SD_BACKFACING)? 
1.0f/eta: eta; - float f = fresnel_dielectric_cos(dot(sd->I, normal_in), eta); + float f = fresnel_dielectric_cos(dot(ccl_fetch(sd, I), normal_in), eta); stack_store_float(stack, out_offset, f); } @@ -44,18 +44,18 @@ ccl_device void svm_node_layer_weight(ShaderData *sd, float *stack, uint4 node) decode_node_uchar4(node.w, &type, &normal_offset, &out_offset, NULL); float blend = (stack_valid(blend_offset))? stack_load_float(stack, blend_offset): __uint_as_float(blend_value); - float3 normal_in = (stack_valid(normal_offset))? stack_load_float3(stack, normal_offset): sd->N; + float3 normal_in = (stack_valid(normal_offset))? stack_load_float3(stack, normal_offset): ccl_fetch(sd, N); float f; if(type == NODE_LAYER_WEIGHT_FRESNEL) { float eta = fmaxf(1.0f - blend, 1e-5f); - eta = (sd->flag & SD_BACKFACING)? eta: 1.0f/eta; + eta = (ccl_fetch(sd, flag) & SD_BACKFACING)? eta: 1.0f/eta; - f = fresnel_dielectric_cos(dot(sd->I, normal_in), eta); + f = fresnel_dielectric_cos(dot(ccl_fetch(sd, I), normal_in), eta); } else { - f = fabsf(dot(sd->I, normal_in)); + f = fabsf(dot(ccl_fetch(sd, I), normal_in)); if(blend != 0.5f) { blend = clamp(blend, 0.0f, 1.0f-1e-5f); diff --git a/intern/cycles/kernel/svm/svm_gamma.h b/intern/cycles/kernel/svm/svm_gamma.h index 8bc59b9c673..b645ff3f0f9 100644 --- a/intern/cycles/kernel/svm/svm_gamma.h +++ b/intern/cycles/kernel/svm/svm_gamma.h @@ -21,14 +21,14 @@ ccl_device void svm_node_gamma(ShaderData *sd, float *stack, uint in_gamma, uint float3 color = stack_load_float3(stack, in_color); float gamma = stack_load_float(stack, in_gamma); - if (color.x > 0.0f) + if(color.x > 0.0f) color.x = powf(color.x, gamma); - if (color.y > 0.0f) + if(color.y > 0.0f) color.y = powf(color.y, gamma); - if (color.z > 0.0f) + if(color.z > 0.0f) color.z = powf(color.z, gamma); - if (stack_valid(out_color)) + if(stack_valid(out_color)) stack_store_float3(stack, out_color, color); } diff --git a/intern/cycles/kernel/svm/svm_geometry.h 
b/intern/cycles/kernel/svm/svm_geometry.h index efbefa77d28..bb06254c3a9 100644 --- a/intern/cycles/kernel/svm/svm_geometry.h +++ b/intern/cycles/kernel/svm/svm_geometry.h @@ -23,15 +23,15 @@ ccl_device void svm_node_geometry(KernelGlobals *kg, ShaderData *sd, float *stac float3 data; switch(type) { - case NODE_GEOM_P: data = sd->P; break; - case NODE_GEOM_N: data = sd->N; break; + case NODE_GEOM_P: data = ccl_fetch(sd, P); break; + case NODE_GEOM_N: data = ccl_fetch(sd, N); break; #ifdef __DPDU__ case NODE_GEOM_T: data = primitive_tangent(kg, sd); break; #endif - case NODE_GEOM_I: data = sd->I; break; - case NODE_GEOM_Ng: data = sd->Ng; break; + case NODE_GEOM_I: data = ccl_fetch(sd, I); break; + case NODE_GEOM_Ng: data = ccl_fetch(sd, Ng); break; #ifdef __UV__ - case NODE_GEOM_uv: data = make_float3(sd->u, sd->v, 0.0f); break; + case NODE_GEOM_uv: data = make_float3(ccl_fetch(sd, u), ccl_fetch(sd, v), 0.0f); break; #endif } @@ -44,8 +44,8 @@ ccl_device void svm_node_geometry_bump_dx(KernelGlobals *kg, ShaderData *sd, flo float3 data; switch(type) { - case NODE_GEOM_P: data = sd->P + sd->dP.dx; break; - case NODE_GEOM_uv: data = make_float3(sd->u + sd->du.dx, sd->v + sd->dv.dx, 0.0f); break; + case NODE_GEOM_P: data = ccl_fetch(sd, P) + ccl_fetch(sd, dP).dx; break; + case NODE_GEOM_uv: data = make_float3(ccl_fetch(sd, u) + ccl_fetch(sd, du).dx, ccl_fetch(sd, v) + ccl_fetch(sd, dv).dx, 0.0f); break; default: svm_node_geometry(kg, sd, stack, type, out_offset); return; } @@ -61,8 +61,8 @@ ccl_device void svm_node_geometry_bump_dy(KernelGlobals *kg, ShaderData *sd, flo float3 data; switch(type) { - case NODE_GEOM_P: data = sd->P + sd->dP.dy; break; - case NODE_GEOM_uv: data = make_float3(sd->u + sd->du.dy, sd->v + sd->dv.dy, 0.0f); break; + case NODE_GEOM_P: data = ccl_fetch(sd, P) + ccl_fetch(sd, dP).dy; break; + case NODE_GEOM_uv: data = make_float3(ccl_fetch(sd, u) + ccl_fetch(sd, du).dy, ccl_fetch(sd, v) + ccl_fetch(sd, dv).dy, 0.0f); break; default: 
svm_node_geometry(kg, sd, stack, type, out_offset); return; } @@ -83,9 +83,9 @@ ccl_device void svm_node_object_info(KernelGlobals *kg, ShaderData *sd, float *s stack_store_float3(stack, out_offset, object_location(kg, sd)); return; } - case NODE_INFO_OB_INDEX: data = object_pass_id(kg, sd->object); break; + case NODE_INFO_OB_INDEX: data = object_pass_id(kg, ccl_fetch(sd, object)); break; case NODE_INFO_MAT_INDEX: data = shader_pass_id(kg, sd); break; - case NODE_INFO_OB_RANDOM: data = object_random_number(kg, sd->object); break; + case NODE_INFO_OB_RANDOM: data = object_random_number(kg, ccl_fetch(sd, object)); break; default: data = 0.0f; break; } @@ -98,44 +98,44 @@ ccl_device void svm_node_particle_info(KernelGlobals *kg, ShaderData *sd, float { switch(type) { case NODE_INFO_PAR_INDEX: { - int particle_id = object_particle_id(kg, sd->object); + int particle_id = object_particle_id(kg, ccl_fetch(sd, object)); stack_store_float(stack, out_offset, particle_index(kg, particle_id)); break; } case NODE_INFO_PAR_AGE: { - int particle_id = object_particle_id(kg, sd->object); + int particle_id = object_particle_id(kg, ccl_fetch(sd, object)); stack_store_float(stack, out_offset, particle_age(kg, particle_id)); break; } case NODE_INFO_PAR_LIFETIME: { - int particle_id = object_particle_id(kg, sd->object); + int particle_id = object_particle_id(kg, ccl_fetch(sd, object)); stack_store_float(stack, out_offset, particle_lifetime(kg, particle_id)); break; } case NODE_INFO_PAR_LOCATION: { - int particle_id = object_particle_id(kg, sd->object); + int particle_id = object_particle_id(kg, ccl_fetch(sd, object)); stack_store_float3(stack, out_offset, particle_location(kg, particle_id)); break; } #if 0 /* XXX float4 currently not supported in SVM stack */ case NODE_INFO_PAR_ROTATION: { - int particle_id = object_particle_id(kg, sd->object); + int particle_id = object_particle_id(kg, ccl_fetch(sd, object)); stack_store_float4(stack, out_offset, particle_rotation(kg, particle_id)); 
break; } #endif case NODE_INFO_PAR_SIZE: { - int particle_id = object_particle_id(kg, sd->object); + int particle_id = object_particle_id(kg, ccl_fetch(sd, object)); stack_store_float(stack, out_offset, particle_size(kg, particle_id)); break; } case NODE_INFO_PAR_VELOCITY: { - int particle_id = object_particle_id(kg, sd->object); + int particle_id = object_particle_id(kg, ccl_fetch(sd, object)); stack_store_float3(stack, out_offset, particle_velocity(kg, particle_id)); break; } case NODE_INFO_PAR_ANGULAR_VELOCITY: { - int particle_id = object_particle_id(kg, sd->object); + int particle_id = object_particle_id(kg, ccl_fetch(sd, object)); stack_store_float3(stack, out_offset, particle_angular_velocity(kg, particle_id)); break; } @@ -153,7 +153,7 @@ ccl_device void svm_node_hair_info(KernelGlobals *kg, ShaderData *sd, float *sta switch(type) { case NODE_INFO_CURVE_IS_STRAND: { - data = (sd->type & PRIMITIVE_ALL_CURVE) != 0; + data = (ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE) != 0; stack_store_float(stack, out_offset, data); break; } @@ -165,7 +165,7 @@ ccl_device void svm_node_hair_info(KernelGlobals *kg, ShaderData *sd, float *sta break; } /*case NODE_INFO_CURVE_FADE: { - data = sd->curve_transparency; + data = ccl_fetch(sd, curve_transparency); stack_store_float(stack, out_offset, data); break; }*/ diff --git a/intern/cycles/kernel/svm/svm_gradient.h b/intern/cycles/kernel/svm/svm_gradient.h index a5e385faddc..53d7b4f812c 100644 --- a/intern/cycles/kernel/svm/svm_gradient.h +++ b/intern/cycles/kernel/svm/svm_gradient.h @@ -66,7 +66,7 @@ ccl_device void svm_node_tex_gradient(ShaderData *sd, float *stack, uint4 node) float3 co = stack_load_float3(stack, co_offset); float f = svm_gradient(co, (NodeGradientType)type); - f = clamp(f, 0.0f, 1.0f); + f = saturate(f); if(stack_valid(fac_offset)) stack_store_float(stack, fac_offset, f); diff --git a/intern/cycles/kernel/svm/svm_hsv.h b/intern/cycles/kernel/svm/svm_hsv.h index eeb4ba25e91..1f2cad60df7 100644 --- 
a/intern/cycles/kernel/svm/svm_hsv.h +++ b/intern/cycles/kernel/svm/svm_hsv.h @@ -46,12 +46,12 @@ ccl_device void svm_node_hsv(KernelGlobals *kg, ShaderData *sd, float *stack, ui color.y = fac*color.y + (1.0f - fac)*in_color.y; color.z = fac*color.z + (1.0f - fac)*in_color.z; - /* Clamp color to prevent negative values cauzed by oversaturation. */ + /* Clamp color to prevent negative values caused by oversaturation. */ color.x = max(color.x, 0.0f); color.y = max(color.y, 0.0f); color.z = max(color.z, 0.0f); - if (stack_valid(out_color_offset)) + if(stack_valid(out_color_offset)) stack_store_float3(stack, out_color_offset, color); } diff --git a/intern/cycles/kernel/svm/svm_image.h b/intern/cycles/kernel/svm/svm_image.h index 4de69479bd9..caf0b37ba35 100644 --- a/intern/cycles/kernel/svm/svm_image.h +++ b/intern/cycles/kernel/svm/svm_image.h @@ -65,7 +65,7 @@ ccl_device float4 svm_image_texture(KernelGlobals *kg, int id, float x, float y, float4 r; int ix, iy, nix, niy; - if (interpolation == INTERPOLATION_CLOSEST) { + if(interpolation == INTERPOLATION_CLOSEST) { svm_image_texture_frac(x*width, &ix); svm_image_texture_frac(y*height, &iy); @@ -251,9 +251,9 @@ ccl_device float4 svm_image_texture(KernelGlobals *kg, int id, float x, float y, case 95: r = kernel_tex_image_interp(__tex_image_095, x, y); break; case 96: r = kernel_tex_image_interp(__tex_image_096, x, y); break; case 97: r = kernel_tex_image_interp(__tex_image_097, x, y); break; - case 98: r = kernel_tex_image_interp(__tex_image_098, x, y); break; #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 300) + case 98: r = kernel_tex_image_interp(__tex_image_098, x, y); break; case 99: r = kernel_tex_image_interp(__tex_image_099, x, y); break; case 100: r = kernel_tex_image_interp(__tex_image_100, x, y); break; case 101: r = kernel_tex_image_interp(__tex_image_101, x, y); break; @@ -392,10 +392,10 @@ ccl_device void svm_node_tex_image(KernelGlobals *kg, ShaderData *sd, float *sta ccl_device void 
svm_node_tex_image_box(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node) { /* get object space normal */ - float3 N = sd->N; + float3 N = ccl_fetch(sd, N); - N = sd->N; - if(sd->object != OBJECT_NONE) + N = ccl_fetch(sd, N); + if(ccl_fetch(sd, object) != OBJECT_NONE) object_inverse_normal_transform(kg, sd, &N); /* project from direction vector to barycentric coordinates in triangles */ @@ -433,17 +433,17 @@ ccl_device void svm_node_tex_image_box(KernelGlobals *kg, ShaderData *sd, float /* in case of blending, test for mixes between two textures */ if(N.z < (1.0f - limit)*(N.y + N.x)) { weight.x = N.x/(N.x + N.y); - weight.x = clamp((weight.x - 0.5f*(1.0f - blend))/blend, 0.0f, 1.0f); + weight.x = saturate((weight.x - 0.5f*(1.0f - blend))/blend); weight.y = 1.0f - weight.x; } else if(N.x < (1.0f - limit)*(N.y + N.z)) { weight.y = N.y/(N.y + N.z); - weight.y = clamp((weight.y - 0.5f*(1.0f - blend))/blend, 0.0f, 1.0f); + weight.y = saturate((weight.y - 0.5f*(1.0f - blend))/blend); weight.z = 1.0f - weight.y; } else if(N.y < (1.0f - limit)*(N.x + N.z)) { weight.x = N.x/(N.x + N.z); - weight.x = clamp((weight.x - 0.5f*(1.0f - blend))/blend, 0.0f, 1.0f); + weight.x = saturate((weight.x - 0.5f*(1.0f - blend))/blend); weight.z = 1.0f - weight.x; } else { diff --git a/intern/cycles/kernel/svm/svm_invert.h b/intern/cycles/kernel/svm/svm_invert.h index 152b49174e0..5ce858e2e5d 100644 --- a/intern/cycles/kernel/svm/svm_invert.h +++ b/intern/cycles/kernel/svm/svm_invert.h @@ -30,7 +30,7 @@ ccl_device void svm_node_invert(ShaderData *sd, float *stack, uint in_fac, uint color.y = invert(color.y, factor); color.z = invert(color.z, factor); - if (stack_valid(out_color)) + if(stack_valid(out_color)) stack_store_float3(stack, out_color, color); } diff --git a/intern/cycles/kernel/svm/svm_light_path.h b/intern/cycles/kernel/svm/svm_light_path.h index 677d139c5d4..a235dd35224 100644 --- a/intern/cycles/kernel/svm/svm_light_path.h +++ 
b/intern/cycles/kernel/svm/svm_light_path.h @@ -31,10 +31,10 @@ ccl_device void svm_node_light_path(ShaderData *sd, float *stack, uint type, uin case NODE_LP_reflection: info = (path_flag & PATH_RAY_REFLECT)? 1.0f: 0.0f; break; case NODE_LP_transmission: info = (path_flag & PATH_RAY_TRANSMIT)? 1.0f: 0.0f; break; case NODE_LP_volume_scatter: info = (path_flag & PATH_RAY_VOLUME_SCATTER)? 1.0f: 0.0f; break; - case NODE_LP_backfacing: info = (sd->flag & SD_BACKFACING)? 1.0f: 0.0f; break; - case NODE_LP_ray_length: info = sd->ray_length; break; - case NODE_LP_ray_depth: info = (float)sd->ray_depth; break; - case NODE_LP_ray_transparent: info = sd->transparent_depth; break; + case NODE_LP_backfacing: info = (ccl_fetch(sd, flag) & SD_BACKFACING)? 1.0f: 0.0f; break; + case NODE_LP_ray_length: info = ccl_fetch(sd, ray_length); break; + case NODE_LP_ray_depth: info = (float)ccl_fetch(sd, ray_depth); break; + case NODE_LP_ray_transparent: info = (float)ccl_fetch(sd, transparent_depth); break; } stack_store_float(stack, out_offset, info); @@ -53,14 +53,14 @@ ccl_device void svm_node_light_falloff(ShaderData *sd, float *stack, uint4 node) switch(type) { case NODE_LIGHT_FALLOFF_QUADRATIC: break; - case NODE_LIGHT_FALLOFF_LINEAR: strength *= sd->ray_length; break; - case NODE_LIGHT_FALLOFF_CONSTANT: strength *= sd->ray_length*sd->ray_length; break; + case NODE_LIGHT_FALLOFF_LINEAR: strength *= ccl_fetch(sd, ray_length); break; + case NODE_LIGHT_FALLOFF_CONSTANT: strength *= ccl_fetch(sd, ray_length)*ccl_fetch(sd, ray_length); break; } float smooth = stack_load_float(stack, smooth_offset); if(smooth > 0.0f) { - float squared = sd->ray_length*sd->ray_length; + float squared = ccl_fetch(sd, ray_length)*ccl_fetch(sd, ray_length); strength *= squared/(smooth + squared); } diff --git a/intern/cycles/kernel/svm/svm_math_util.h b/intern/cycles/kernel/svm/svm_math_util.h index 39cc14d5e8e..645cbd3fc73 100644 --- a/intern/cycles/kernel/svm/svm_math_util.h +++ 
b/intern/cycles/kernel/svm/svm_math_util.h @@ -97,12 +97,74 @@ ccl_device float svm_math(NodeMath type, float Fac1, float Fac2) else if(type == NODE_MATH_ABSOLUTE) Fac = fabsf(Fac1); else if(type == NODE_MATH_CLAMP) - Fac = clamp(Fac1, 0.0f, 1.0f); + Fac = saturate(Fac1); else Fac = 0.0f; return Fac; } +ccl_device float3 svm_math_blackbody_color(float t) { + /* Calculate color in range 800..12000 using an approximation + * a/x+bx+c for R and G and ((at + b)t + c)t + d) for B + * Max absolute error for RGB is (0.00095, 0.00077, 0.00057), + * which is enough to get the same 8 bit/channel color. + */ + + const float rc[6][3] = { + { 2.52432244e+03f, -1.06185848e-03f, 3.11067539e+00f }, + { 3.37763626e+03f, -4.34581697e-04f, 1.64843306e+00f }, + { 4.10671449e+03f, -8.61949938e-05f, 6.41423749e-01f }, + { 4.66849800e+03f, 2.85655028e-05f, 1.29075375e-01f }, + { 4.60124770e+03f, 2.89727618e-05f, 1.48001316e-01f }, + { 3.78765709e+03f, 9.36026367e-06f, 3.98995841e-01f }, + }; + + const float gc[6][3] = { + { -7.50343014e+02f, 3.15679613e-04f, 4.73464526e-01f }, + { -1.00402363e+03f, 1.29189794e-04f, 9.08181524e-01f }, + { -1.22075471e+03f, 2.56245413e-05f, 1.20753416e+00f }, + { -1.42546105e+03f, -4.01730887e-05f, 1.44002695e+00f }, + { -1.18134453e+03f, -2.18913373e-05f, 1.30656109e+00f }, + { -5.00279505e+02f, -4.59745390e-06f, 1.09090465e+00f }, + }; + + const float bc[6][4] = { + { 0.0f, 0.0f, 0.0f, 0.0f }, /* zeros should be optimized by compiler */ + { 0.0f, 0.0f, 0.0f, 0.0f }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + { -2.02524603e-11f, 1.79435860e-07f, -2.60561875e-04f, -1.41761141e-02f }, + { -2.22463426e-13f, -1.55078698e-08f, 3.81675160e-04f, -7.30646033e-01f }, + { 6.72595954e-13f, -2.73059993e-08f, 4.24068546e-04f, -7.52204323e-01f }, + }; + + if(t >= 12000.0f) + return make_float3(0.826270103f, 0.994478524f, 1.56626022f); + + /* Define a macro to reduce stack usage for nvcc */ +#define MAKE_BB_RGB(i) make_float3(\ + rc[i][0] / t + rc[i][1] * t + rc[i][2],\ + gc[i][0] 
/ t + gc[i][1] * t + gc[i][2],\ + ((bc[i][0] * t + bc[i][1]) * t + bc[i][2]) * t + bc[i][3]) + + if(t >= 6365.0f) + return MAKE_BB_RGB(5); + if(t >= 3315.0f) + return MAKE_BB_RGB(4); + if(t >= 1902.0f) + return MAKE_BB_RGB(3); + if(t >= 1449.0f) + return MAKE_BB_RGB(2); + if(t >= 1167.0f) + return MAKE_BB_RGB(1); + if(t >= 965.0f) + return MAKE_BB_RGB(0); + +#undef MAKE_BB_RGB + + /* For 800 <= t < 965 color does not change in OSL implementation, so keep color the same */ + return make_float3(4.70366907f, 0.0f, 0.0f); +} + CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/svm/svm_mix.h b/intern/cycles/kernel/svm/svm_mix.h index b6b1966cb3b..6111214acba 100644 --- a/intern/cycles/kernel/svm/svm_mix.h +++ b/intern/cycles/kernel/svm/svm_mix.h @@ -254,16 +254,16 @@ ccl_device float3 svm_mix_clamp(float3 col) { float3 outcol = col; - outcol.x = clamp(col.x, 0.0f, 1.0f); - outcol.y = clamp(col.y, 0.0f, 1.0f); - outcol.z = clamp(col.z, 0.0f, 1.0f); + outcol.x = saturate(col.x); + outcol.y = saturate(col.y); + outcol.z = saturate(col.z); return outcol; } ccl_device float3 svm_mix(NodeMix type, float fac, float3 c1, float3 c2) { - float t = clamp(fac, 0.0f, 1.0f); + float t = saturate(fac); switch(type) { case NODE_MIX_BLEND: return svm_mix_blend(t, c1, c2); diff --git a/intern/cycles/kernel/svm/svm_musgrave.h b/intern/cycles/kernel/svm/svm_musgrave.h index 2f81ddaa74c..09eba31945e 100644 --- a/intern/cycles/kernel/svm/svm_musgrave.h +++ b/intern/cycles/kernel/svm/svm_musgrave.h @@ -25,7 +25,7 @@ CCL_NAMESPACE_BEGIN * from "Texturing and Modelling: A procedural approach" */ -ccl_device_noinline float noise_musgrave_fBm(float3 p, NodeNoiseBasis basis, float H, float lacunarity, float octaves) +ccl_device_noinline float noise_musgrave_fBm(float3 p, float H, float lacunarity, float octaves) { float rmd; float value = 0.0f; @@ -53,7 +53,7 @@ ccl_device_noinline float noise_musgrave_fBm(float3 p, NodeNoiseBasis basis, flo * octaves: number of frequencies in the fBm */ 
-ccl_device_noinline float noise_musgrave_multi_fractal(float3 p, NodeNoiseBasis basis, float H, float lacunarity, float octaves) +ccl_device_noinline float noise_musgrave_multi_fractal(float3 p, float H, float lacunarity, float octaves) { float rmd; float value = 1.0f; @@ -82,7 +82,7 @@ ccl_device_noinline float noise_musgrave_multi_fractal(float3 p, NodeNoiseBasis * offset: raises the terrain from `sea level' */ -ccl_device_noinline float noise_musgrave_hetero_terrain(float3 p, NodeNoiseBasis basis, float H, float lacunarity, float octaves, float offset) +ccl_device_noinline float noise_musgrave_hetero_terrain(float3 p, float H, float lacunarity, float octaves, float offset) { float value, increment, rmd; float pwHL = powf(lacunarity, -H); @@ -117,7 +117,7 @@ ccl_device_noinline float noise_musgrave_hetero_terrain(float3 p, NodeNoiseBasis * offset: raises the terrain from `sea level' */ -ccl_device_noinline float noise_musgrave_hybrid_multi_fractal(float3 p, NodeNoiseBasis basis, float H, float lacunarity, float octaves, float offset, float gain) +ccl_device_noinline float noise_musgrave_hybrid_multi_fractal(float3 p, float H, float lacunarity, float octaves, float offset, float gain) { float result, signal, weight, rmd; float pwHL = powf(lacunarity, -H); @@ -154,7 +154,7 @@ ccl_device_noinline float noise_musgrave_hybrid_multi_fractal(float3 p, NodeNois * offset: raises the terrain from `sea level' */ -ccl_device_noinline float noise_musgrave_ridged_multi_fractal(float3 p, NodeNoiseBasis basis, float H, float lacunarity, float octaves, float offset, float gain) +ccl_device_noinline float noise_musgrave_ridged_multi_fractal(float3 p, float H, float lacunarity, float octaves, float offset, float gain) { float result, signal, weight; float pwHL = powf(lacunarity, -H); @@ -168,7 +168,7 @@ ccl_device_noinline float noise_musgrave_ridged_multi_fractal(float3 p, NodeNois for(i = 1; i < float_to_int(octaves); i++) { p *= lacunarity; - weight = clamp(signal * gain, 0.0f, 
1.0f); + weight = saturate(signal * gain); signal = offset - fabsf(snoise(p)); signal *= signal; signal *= weight; @@ -183,18 +183,16 @@ ccl_device_noinline float noise_musgrave_ridged_multi_fractal(float3 p, NodeNois ccl_device float svm_musgrave(NodeMusgraveType type, float dimension, float lacunarity, float octaves, float offset, float intensity, float gain, float3 p) { - NodeNoiseBasis basis = NODE_NOISE_PERLIN; - if(type == NODE_MUSGRAVE_MULTIFRACTAL) - return intensity*noise_musgrave_multi_fractal(p, basis, dimension, lacunarity, octaves); + return intensity*noise_musgrave_multi_fractal(p, dimension, lacunarity, octaves); else if(type == NODE_MUSGRAVE_FBM) - return intensity*noise_musgrave_fBm(p, basis, dimension, lacunarity, octaves); + return intensity*noise_musgrave_fBm(p, dimension, lacunarity, octaves); else if(type == NODE_MUSGRAVE_HYBRID_MULTIFRACTAL) - return intensity*noise_musgrave_hybrid_multi_fractal(p, basis, dimension, lacunarity, octaves, offset, gain); + return intensity*noise_musgrave_hybrid_multi_fractal(p, dimension, lacunarity, octaves, offset, gain); else if(type == NODE_MUSGRAVE_RIDGED_MULTIFRACTAL) - return intensity*noise_musgrave_ridged_multi_fractal(p, basis, dimension, lacunarity, octaves, offset, gain); + return intensity*noise_musgrave_ridged_multi_fractal(p, dimension, lacunarity, octaves, offset, gain); else if(type == NODE_MUSGRAVE_HETERO_TERRAIN) - return intensity*noise_musgrave_hetero_terrain(p, basis, dimension, lacunarity, octaves, offset); + return intensity*noise_musgrave_hetero_terrain(p, dimension, lacunarity, octaves, offset); return 0.0f; } diff --git a/intern/cycles/kernel/svm/svm_noisetex.h b/intern/cycles/kernel/svm/svm_noisetex.h index eccd119b74f..62ff38cf1c5 100644 --- a/intern/cycles/kernel/svm/svm_noisetex.h +++ b/intern/cycles/kernel/svm/svm_noisetex.h @@ -20,23 +20,22 @@ CCL_NAMESPACE_BEGIN ccl_device_inline void svm_noise(float3 p, float detail, float distortion, float *fac, float3 *color) { - 
NodeNoiseBasis basis = NODE_NOISE_PERLIN; int hard = 0; if(distortion != 0.0f) { float3 r, offset = make_float3(13.5f, 13.5f, 13.5f); - r.x = noise_basis(p + offset, basis) * distortion; - r.y = noise_basis(p, basis) * distortion; - r.z = noise_basis(p - offset, basis) * distortion; + r.x = noise(p + offset) * distortion; + r.y = noise(p) * distortion; + r.z = noise(p - offset) * distortion; p += r; } - *fac = noise_turbulence(p, basis, detail, hard); + *fac = noise_turbulence(p, detail, hard); *color = make_float3(*fac, - noise_turbulence(make_float3(p.y, p.x, p.z), basis, detail, hard), - noise_turbulence(make_float3(p.y, p.z, p.x), basis, detail, hard)); + noise_turbulence(make_float3(p.y, p.x, p.z), detail, hard), + noise_turbulence(make_float3(p.y, p.z, p.x), detail, hard)); } ccl_device void svm_node_tex_noise(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int *offset) diff --git a/intern/cycles/kernel/svm/svm_normal.h b/intern/cycles/kernel/svm/svm_normal.h index 67b5321e0de..53abef71012 100644 --- a/intern/cycles/kernel/svm/svm_normal.h +++ b/intern/cycles/kernel/svm/svm_normal.h @@ -28,10 +28,10 @@ ccl_device void svm_node_normal(KernelGlobals *kg, ShaderData *sd, float *stack, direction.z = __int_as_float(node1.z); direction = normalize(direction); - if (stack_valid(out_normal_offset)) + if(stack_valid(out_normal_offset)) stack_store_float3(stack, out_normal_offset, direction); - if (stack_valid(out_dot_offset)) + if(stack_valid(out_dot_offset)) stack_store_float(stack, out_dot_offset, dot(direction, normalize(normal))); } diff --git a/intern/cycles/kernel/svm/svm_ramp.h b/intern/cycles/kernel/svm/svm_ramp.h index 998f649a571..062ab013b1f 100644 --- a/intern/cycles/kernel/svm/svm_ramp.h +++ b/intern/cycles/kernel/svm/svm_ramp.h @@ -21,7 +21,7 @@ CCL_NAMESPACE_BEGIN ccl_device float4 rgb_ramp_lookup(KernelGlobals *kg, int offset, float f, bool interpolate) { - f = clamp(f, 0.0f, 1.0f)*(RAMP_TABLE_SIZE-1); + f = 
saturate(f)*(RAMP_TABLE_SIZE-1); /* clamp int as well in case of NaN */ int i = clamp(float_to_int(f), 0, RAMP_TABLE_SIZE-1); diff --git a/intern/cycles/kernel/svm/svm_sepcomb_hsv.h b/intern/cycles/kernel/svm/svm_sepcomb_hsv.h index 68f9fea02f0..6f51b163756 100644 --- a/intern/cycles/kernel/svm/svm_sepcomb_hsv.h +++ b/intern/cycles/kernel/svm/svm_sepcomb_hsv.h @@ -28,7 +28,7 @@ ccl_device void svm_node_combine_hsv(KernelGlobals *kg, ShaderData *sd, float *s /* Combine, and convert back to RGB */ float3 color = hsv_to_rgb(make_float3(hue, saturation, value)); - if (stack_valid(color_out)) + if(stack_valid(color_out)) stack_store_float3(stack, color_out, color); } @@ -42,11 +42,11 @@ ccl_device void svm_node_separate_hsv(KernelGlobals *kg, ShaderData *sd, float * /* Convert to HSV */ color = rgb_to_hsv(color); - if (stack_valid(hue_out)) + if(stack_valid(hue_out)) stack_store_float(stack, hue_out, color.x); - if (stack_valid(saturation_out)) + if(stack_valid(saturation_out)) stack_store_float(stack, saturation_out, color.y); - if (stack_valid(value_out)) + if(stack_valid(value_out)) stack_store_float(stack, value_out, color.z); } diff --git a/intern/cycles/kernel/svm/svm_sepcomb_vector.h b/intern/cycles/kernel/svm/svm_sepcomb_vector.h index 7a5a69f6dff..63570dd6942 100644 --- a/intern/cycles/kernel/svm/svm_sepcomb_vector.h +++ b/intern/cycles/kernel/svm/svm_sepcomb_vector.h @@ -22,7 +22,7 @@ ccl_device void svm_node_combine_vector(ShaderData *sd, float *stack, uint in_of { float vector = stack_load_float(stack, in_offset); - if (stack_valid(out_offset)) + if(stack_valid(out_offset)) stack_store_float(stack, out_offset+vector_index, vector); } @@ -30,10 +30,10 @@ ccl_device void svm_node_separate_vector(ShaderData *sd, float *stack, uint ivec { float3 vector = stack_load_float3(stack, ivector_offset); - if (stack_valid(out_offset)) { - if (vector_index == 0) + if(stack_valid(out_offset)) { + if(vector_index == 0) stack_store_float(stack, out_offset, vector.x); - else 
if (vector_index == 1) + else if(vector_index == 1) stack_store_float(stack, out_offset, vector.y); else stack_store_float(stack, out_offset, vector.z); diff --git a/intern/cycles/kernel/svm/svm_tex_coord.h b/intern/cycles/kernel/svm/svm_tex_coord.h index a399acf3c0f..eebd9bee420 100644 --- a/intern/cycles/kernel/svm/svm_tex_coord.h +++ b/intern/cycles/kernel/svm/svm_tex_coord.h @@ -31,9 +31,9 @@ ccl_device void svm_node_tex_coord(KernelGlobals *kg, switch(type) { case NODE_TEXCO_OBJECT: { - data = sd->P; + data = ccl_fetch(sd, P); if(node.w == 0) { - if(sd->object != OBJECT_NONE) { + if(ccl_fetch(sd, object) != OBJECT_NONE) { object_inverse_position_transform(kg, sd, &data); } } @@ -48,48 +48,48 @@ ccl_device void svm_node_tex_coord(KernelGlobals *kg, break; } case NODE_TEXCO_NORMAL: { - data = sd->N; - if(sd->object != OBJECT_NONE) + data = ccl_fetch(sd, N); + if(ccl_fetch(sd, object) != OBJECT_NONE) object_inverse_normal_transform(kg, sd, &data); break; } case NODE_TEXCO_CAMERA: { Transform tfm = kernel_data.cam.worldtocamera; - if(sd->object != OBJECT_NONE) - data = transform_point(&tfm, sd->P); + if(ccl_fetch(sd, object) != OBJECT_NONE) + data = transform_point(&tfm, ccl_fetch(sd, P)); else - data = transform_point(&tfm, sd->P + camera_position(kg)); + data = transform_point(&tfm, ccl_fetch(sd, P) + camera_position(kg)); break; } case NODE_TEXCO_WINDOW: { - if((path_flag & PATH_RAY_CAMERA) && sd->object == OBJECT_NONE && kernel_data.cam.type == CAMERA_ORTHOGRAPHIC) - data = camera_world_to_ndc(kg, sd, sd->ray_P); + if((path_flag & PATH_RAY_CAMERA) && ccl_fetch(sd, object) == OBJECT_NONE && kernel_data.cam.type == CAMERA_ORTHOGRAPHIC) + data = camera_world_to_ndc(kg, sd, ccl_fetch(sd, ray_P)); else - data = camera_world_to_ndc(kg, sd, sd->P); + data = camera_world_to_ndc(kg, sd, ccl_fetch(sd, P)); data.z = 0.0f; break; } case NODE_TEXCO_REFLECTION: { - if(sd->object != OBJECT_NONE) - data = 2.0f*dot(sd->N, sd->I)*sd->N - sd->I; + if(ccl_fetch(sd, object) != 
OBJECT_NONE) + data = 2.0f*dot(ccl_fetch(sd, N), ccl_fetch(sd, I))*ccl_fetch(sd, N) - ccl_fetch(sd, I); else - data = sd->I; + data = ccl_fetch(sd, I); break; } case NODE_TEXCO_DUPLI_GENERATED: { - data = object_dupli_generated(kg, sd->object); + data = object_dupli_generated(kg, ccl_fetch(sd, object)); break; } case NODE_TEXCO_DUPLI_UV: { - data = object_dupli_uv(kg, sd->object); + data = object_dupli_uv(kg, ccl_fetch(sd, object)); break; } case NODE_TEXCO_VOLUME_GENERATED: { - data = sd->P; + data = ccl_fetch(sd, P); #ifdef __VOLUME__ - if(sd->object != OBJECT_NONE) + if(ccl_fetch(sd, object) != OBJECT_NONE) data = volume_normalized_position(kg, sd, data); #endif break; @@ -113,9 +113,9 @@ ccl_device void svm_node_tex_coord_bump_dx(KernelGlobals *kg, switch(type) { case NODE_TEXCO_OBJECT: { - data = sd->P + sd->dP.dx; + data = ccl_fetch(sd, P) + ccl_fetch(sd, dP).dx; if(node.w == 0) { - if(sd->object != OBJECT_NONE) { + if(ccl_fetch(sd, object) != OBJECT_NONE) { object_inverse_position_transform(kg, sd, &data); } } @@ -130,48 +130,48 @@ ccl_device void svm_node_tex_coord_bump_dx(KernelGlobals *kg, break; } case NODE_TEXCO_NORMAL: { - data = sd->N; - if(sd->object != OBJECT_NONE) + data = ccl_fetch(sd, N); + if(ccl_fetch(sd, object) != OBJECT_NONE) object_inverse_normal_transform(kg, sd, &data); break; } case NODE_TEXCO_CAMERA: { Transform tfm = kernel_data.cam.worldtocamera; - if(sd->object != OBJECT_NONE) - data = transform_point(&tfm, sd->P + sd->dP.dx); + if(ccl_fetch(sd, object) != OBJECT_NONE) + data = transform_point(&tfm, ccl_fetch(sd, P) + ccl_fetch(sd, dP).dx); else - data = transform_point(&tfm, sd->P + sd->dP.dx + camera_position(kg)); + data = transform_point(&tfm, ccl_fetch(sd, P) + ccl_fetch(sd, dP).dx + camera_position(kg)); break; } case NODE_TEXCO_WINDOW: { - if((path_flag & PATH_RAY_CAMERA) && sd->object == OBJECT_NONE && kernel_data.cam.type == CAMERA_ORTHOGRAPHIC) - data = camera_world_to_ndc(kg, sd, sd->ray_P + sd->ray_dP.dx); + if((path_flag 
& PATH_RAY_CAMERA) && ccl_fetch(sd, object) == OBJECT_NONE && kernel_data.cam.type == CAMERA_ORTHOGRAPHIC) + data = camera_world_to_ndc(kg, sd, ccl_fetch(sd, ray_P) + ccl_fetch(sd, ray_dP).dx); else - data = camera_world_to_ndc(kg, sd, sd->P + sd->dP.dx); + data = camera_world_to_ndc(kg, sd, ccl_fetch(sd, P) + ccl_fetch(sd, dP).dx); data.z = 0.0f; break; } case NODE_TEXCO_REFLECTION: { - if(sd->object != OBJECT_NONE) - data = 2.0f*dot(sd->N, sd->I)*sd->N - sd->I; + if(ccl_fetch(sd, object) != OBJECT_NONE) + data = 2.0f*dot(ccl_fetch(sd, N), ccl_fetch(sd, I))*ccl_fetch(sd, N) - ccl_fetch(sd, I); else - data = sd->I; + data = ccl_fetch(sd, I); break; } case NODE_TEXCO_DUPLI_GENERATED: { - data = object_dupli_generated(kg, sd->object); + data = object_dupli_generated(kg, ccl_fetch(sd, object)); break; } case NODE_TEXCO_DUPLI_UV: { - data = object_dupli_uv(kg, sd->object); + data = object_dupli_uv(kg, ccl_fetch(sd, object)); break; } case NODE_TEXCO_VOLUME_GENERATED: { - data = sd->P + sd->dP.dx; + data = ccl_fetch(sd, P) + ccl_fetch(sd, dP).dx; #ifdef __VOLUME__ - if(sd->object != OBJECT_NONE) + if(ccl_fetch(sd, object) != OBJECT_NONE) data = volume_normalized_position(kg, sd, data); #endif break; @@ -198,9 +198,9 @@ ccl_device void svm_node_tex_coord_bump_dy(KernelGlobals *kg, switch(type) { case NODE_TEXCO_OBJECT: { - data = sd->P + sd->dP.dy; + data = ccl_fetch(sd, P) + ccl_fetch(sd, dP).dy; if(node.w == 0) { - if(sd->object != OBJECT_NONE) { + if(ccl_fetch(sd, object) != OBJECT_NONE) { object_inverse_position_transform(kg, sd, &data); } } @@ -215,48 +215,48 @@ ccl_device void svm_node_tex_coord_bump_dy(KernelGlobals *kg, break; } case NODE_TEXCO_NORMAL: { - data = sd->N; - if(sd->object != OBJECT_NONE) + data = ccl_fetch(sd, N); + if(ccl_fetch(sd, object) != OBJECT_NONE) object_inverse_normal_transform(kg, sd, &data); break; } case NODE_TEXCO_CAMERA: { Transform tfm = kernel_data.cam.worldtocamera; - if(sd->object != OBJECT_NONE) - data = transform_point(&tfm, 
sd->P + sd->dP.dy); + if(ccl_fetch(sd, object) != OBJECT_NONE) + data = transform_point(&tfm, ccl_fetch(sd, P) + ccl_fetch(sd, dP).dy); else - data = transform_point(&tfm, sd->P + sd->dP.dy + camera_position(kg)); + data = transform_point(&tfm, ccl_fetch(sd, P) + ccl_fetch(sd, dP).dy + camera_position(kg)); break; } case NODE_TEXCO_WINDOW: { - if((path_flag & PATH_RAY_CAMERA) && sd->object == OBJECT_NONE && kernel_data.cam.type == CAMERA_ORTHOGRAPHIC) - data = camera_world_to_ndc(kg, sd, sd->ray_P + sd->ray_dP.dy); + if((path_flag & PATH_RAY_CAMERA) && ccl_fetch(sd, object) == OBJECT_NONE && kernel_data.cam.type == CAMERA_ORTHOGRAPHIC) + data = camera_world_to_ndc(kg, sd, ccl_fetch(sd, ray_P) + ccl_fetch(sd, ray_dP).dy); else - data = camera_world_to_ndc(kg, sd, sd->P + sd->dP.dy); + data = camera_world_to_ndc(kg, sd, ccl_fetch(sd, P) + ccl_fetch(sd, dP).dy); data.z = 0.0f; break; } case NODE_TEXCO_REFLECTION: { - if(sd->object != OBJECT_NONE) - data = 2.0f*dot(sd->N, sd->I)*sd->N - sd->I; + if(ccl_fetch(sd, object) != OBJECT_NONE) + data = 2.0f*dot(ccl_fetch(sd, N), ccl_fetch(sd, I))*ccl_fetch(sd, N) - ccl_fetch(sd, I); else - data = sd->I; + data = ccl_fetch(sd, I); break; } case NODE_TEXCO_DUPLI_GENERATED: { - data = object_dupli_generated(kg, sd->object); + data = object_dupli_generated(kg, ccl_fetch(sd, object)); break; } case NODE_TEXCO_DUPLI_UV: { - data = object_dupli_uv(kg, sd->object); + data = object_dupli_uv(kg, ccl_fetch(sd, object)); break; } case NODE_TEXCO_VOLUME_GENERATED: { - data = sd->P + sd->dP.dy; + data = ccl_fetch(sd, P) + ccl_fetch(sd, dP).dy; #ifdef __VOLUME__ - if(sd->object != OBJECT_NONE) + if(ccl_fetch(sd, object) != OBJECT_NONE) data = volume_normalized_position(kg, sd, data); #endif break; @@ -281,7 +281,7 @@ ccl_device void svm_node_normal_map(KernelGlobals *kg, ShaderData *sd, float *st if(space == NODE_NORMAL_MAP_TANGENT) { /* tangent space */ - if(sd->object == OBJECT_NONE) { + if(ccl_fetch(sd, object) == OBJECT_NONE) { 
stack_store_float3(stack, normal_offset, make_float3(0.0f, 0.0f, 0.0f)); return; } @@ -302,11 +302,11 @@ ccl_device void svm_node_normal_map(KernelGlobals *kg, ShaderData *sd, float *st float sign = primitive_attribute_float(kg, sd, attr_sign_elem, attr_sign_offset, NULL, NULL); float3 normal; - if(sd->shader & SHADER_SMOOTH_NORMAL) { + if(ccl_fetch(sd, shader) & SHADER_SMOOTH_NORMAL) { normal = primitive_attribute_float3(kg, sd, attr_normal_elem, attr_normal_offset, NULL, NULL); } else { - normal = sd->Ng; + normal = ccl_fetch(sd, Ng); object_inverse_normal_transform(kg, sd, &normal); } @@ -337,7 +337,7 @@ ccl_device void svm_node_normal_map(KernelGlobals *kg, ShaderData *sd, float *st if(strength != 1.0f) { strength = max(strength, 0.0f); - N = normalize(sd->N + (N - sd->N)*strength); + N = normalize(ccl_fetch(sd, N) + (N - ccl_fetch(sd, N))*strength); } stack_store_float3(stack, normal_offset, N); @@ -367,7 +367,7 @@ ccl_device void svm_node_tangent(KernelGlobals *kg, ShaderData *sd, float *stack float3 generated; if(attr_offset == ATTR_STD_NOT_FOUND) - generated = sd->P; + generated = ccl_fetch(sd, P); else generated = primitive_attribute_float3(kg, sd, attr_elem, attr_offset, NULL, NULL); @@ -380,7 +380,7 @@ ccl_device void svm_node_tangent(KernelGlobals *kg, ShaderData *sd, float *stack } object_normal_transform(kg, sd, &tangent); - tangent = cross(sd->N, normalize(cross(tangent, sd->N))); + tangent = cross(ccl_fetch(sd, N), normalize(cross(tangent, ccl_fetch(sd, N)))); stack_store_float3(stack, tangent_offset, tangent); } diff --git a/intern/cycles/kernel/svm/svm_texture.h b/intern/cycles/kernel/svm/svm_texture.h index c5dc213c82d..dcb00f7dd55 100644 --- a/intern/cycles/kernel/svm/svm_texture.h +++ b/intern/cycles/kernel/svm/svm_texture.h @@ -16,261 +16,9 @@ CCL_NAMESPACE_BEGIN -/* Voronoi Distances */ - -#if 0 -ccl_device float voronoi_distance(NodeDistanceMetric distance_metric, float3 d, float e) -{ -#if 0 - if(distance_metric == 
NODE_VORONOI_DISTANCE_SQUARED) -#endif - return dot(d, d); -#if 0 - if(distance_metric == NODE_VORONOI_ACTUAL_DISTANCE) - return len(d); - if(distance_metric == NODE_VORONOI_MANHATTAN) - return fabsf(d.x) + fabsf(d.y) + fabsf(d.z); - if(distance_metric == NODE_VORONOI_CHEBYCHEV) - return fmaxf(fabsf(d.x), fmaxf(fabsf(d.y), fabsf(d.z))); - if(distance_metric == NODE_VORONOI_MINKOVSKY_H) - return sqrtf(fabsf(d.x)) + sqrtf(fabsf(d.y)) + sqrtf(fabsf(d.y)); - if(distance_metric == NODE_VORONOI_MINKOVSKY_4) - return sqrtf(sqrtf(dot(d*d, d*d))); - if(distance_metric == NODE_VORONOI_MINKOVSKY) - return powf(powf(fabsf(d.x), e) + powf(fabsf(d.y), e) + powf(fabsf(d.z), e), 1.0f/e); - - return 0.0f; -#endif -} - -/* Voronoi / Worley like */ -ccl_device_inline float4 voronoi_Fn(float3 p, float e, int n1, int n2) -{ - float da[4]; - float3 pa[4]; - NodeDistanceMetric distance_metric = NODE_VORONOI_DISTANCE_SQUARED; - - /* returns distances in da and point coords in pa */ - int xx, yy, zz, xi, yi, zi; - - xi = floor_to_int(p.x); - yi = floor_to_int(p.y); - zi = floor_to_int(p.z); - - da[0] = 1e10f; - da[1] = 1e10f; - da[2] = 1e10f; - da[3] = 1e10f; - - pa[0] = make_float3(0.0f, 0.0f, 0.0f); - pa[1] = make_float3(0.0f, 0.0f, 0.0f); - pa[2] = make_float3(0.0f, 0.0f, 0.0f); - pa[3] = make_float3(0.0f, 0.0f, 0.0f); - - for(xx = xi-1; xx <= xi+1; xx++) { - for(yy = yi-1; yy <= yi+1; yy++) { - for(zz = zi-1; zz <= zi+1; zz++) { - float3 ip = make_float3((float)xx, (float)yy, (float)zz); - float3 vp = cellnoise_color(ip); - float3 pd = p - (vp + ip); - float d = voronoi_distance(distance_metric, pd, e); - - vp += ip; - - if(d < da[0]) { - da[3] = da[2]; - da[2] = da[1]; - da[1] = da[0]; - da[0] = d; - - pa[3] = pa[2]; - pa[2] = pa[1]; - pa[1] = pa[0]; - pa[0] = vp; - } - else if(d < da[1]) { - da[3] = da[2]; - da[2] = da[1]; - da[1] = d; - - pa[3] = pa[2]; - pa[2] = pa[1]; - pa[1] = vp; - } - else if(d < da[2]) { - da[3] = da[2]; - da[2] = d; - - pa[3] = pa[2]; - pa[2] = vp; - } - else 
if(d < da[3]) { - da[3] = d; - pa[3] = vp; - } - } - } - } - - float4 result = make_float4(pa[n1].x, pa[n1].y, pa[n1].z, da[n1]); - - if(n2 != -1) - result = make_float4(pa[n2].x, pa[n2].y, pa[n2].z, da[n2]) - result; - - return result; -} -#endif - -ccl_device float voronoi_F1_distance(float3 p) -{ - /* returns squared distance in da */ - float da = 1e10f; - -#ifndef __KERNEL_SSE2__ - int ix = floor_to_int(p.x), iy = floor_to_int(p.y), iz = floor_to_int(p.z); - - for (int xx = -1; xx <= 1; xx++) { - for (int yy = -1; yy <= 1; yy++) { - for (int zz = -1; zz <= 1; zz++) { - float3 ip = make_float3(ix + xx, iy + yy, iz + zz); - float3 vp = ip + cellnoise_color(ip); - float d = len_squared(p - vp); - da = min(d, da); - } - } - } -#else - ssef vec_p = load4f(p); - ssei xyzi = quick_floor_sse(vec_p); - - for (int xx = -1; xx <= 1; xx++) { - for (int yy = -1; yy <= 1; yy++) { - for (int zz = -1; zz <= 1; zz++) { - ssef ip = ssef(xyzi + ssei(xx, yy, zz, 0)); - ssef vp = ip + cellnoise_color(ip); - float d = len_squared<1, 1, 1, 0>(vec_p - vp); - da = min(d, da); - } - } - } -#endif - - return da; -} - -ccl_device float3 voronoi_F1_color(float3 p) -{ - /* returns color of the nearest point */ - float da = 1e10f; - -#ifndef __KERNEL_SSE2__ - float3 pa; - int ix = floor_to_int(p.x), iy = floor_to_int(p.y), iz = floor_to_int(p.z); - - for (int xx = -1; xx <= 1; xx++) { - for (int yy = -1; yy <= 1; yy++) { - for (int zz = -1; zz <= 1; zz++) { - float3 ip = make_float3(ix + xx, iy + yy, iz + zz); - float3 vp = ip + cellnoise_color(ip); - float d = len_squared(p - vp); - - if(d < da) { - da = d; - pa = vp; - } - } - } - } - - return cellnoise_color(pa); -#else - ssef pa, vec_p = load4f(p); - ssei xyzi = quick_floor_sse(vec_p); - - for (int xx = -1; xx <= 1; xx++) { - for (int yy = -1; yy <= 1; yy++) { - for (int zz = -1; zz <= 1; zz++) { - ssef ip = ssef(xyzi + ssei(xx, yy, zz, 0)); - ssef vp = ip + cellnoise_color(ip); - float d = len_squared<1, 1, 1, 0>(vec_p - vp); - - if(d < 
da) { - da = d; - pa = vp; - } - } - } - } - - ssef color = cellnoise_color(pa); - return (float3 &)color; -#endif -} - -#if 0 -ccl_device float voronoi_F1(float3 p) { return voronoi_Fn(p, 0.0f, 0, -1).w; } -ccl_device float voronoi_F2(float3 p) { return voronoi_Fn(p, 0.0f, 1, -1).w; } -ccl_device float voronoi_F3(float3 p) { return voronoi_Fn(p, 0.0f, 2, -1).w; } -ccl_device float voronoi_F4(float3 p) { return voronoi_Fn(p, 0.0f, 3, -1).w; } -ccl_device float voronoi_F1F2(float3 p) { return voronoi_Fn(p, 0.0f, 0, 1).w; } - -ccl_device float voronoi_Cr(float3 p) -{ - /* crackle type pattern, just a scale/clamp of F2-F1 */ - float t = 10.0f*voronoi_F1F2(p); - return (t > 1.0f)? 1.0f: t; -} - -ccl_device float voronoi_F1S(float3 p) { return 2.0f*voronoi_F1(p) - 1.0f; } -ccl_device float voronoi_F2S(float3 p) { return 2.0f*voronoi_F2(p) - 1.0f; } -ccl_device float voronoi_F3S(float3 p) { return 2.0f*voronoi_F3(p) - 1.0f; } -ccl_device float voronoi_F4S(float3 p) { return 2.0f*voronoi_F4(p) - 1.0f; } -ccl_device float voronoi_F1F2S(float3 p) { return 2.0f*voronoi_F1F2(p) - 1.0f; } -ccl_device float voronoi_CrS(float3 p) { return 2.0f*voronoi_Cr(p) - 1.0f; } -#endif - -/* Noise Bases */ - -ccl_device float noise_basis(float3 p, NodeNoiseBasis basis) -{ - /* Only Perlin enabled for now, others break CUDA compile by making kernel - * too big, with compile using > 4GB, due to everything being inlined. 
*/ - -#if 0 - if(basis == NODE_NOISE_PERLIN) -#endif - return noise(p); -#if 0 - if(basis == NODE_NOISE_VORONOI_F1) - return voronoi_F1S(p); - if(basis == NODE_NOISE_VORONOI_F2) - return voronoi_F2S(p); - if(basis == NODE_NOISE_VORONOI_F3) - return voronoi_F3S(p); - if(basis == NODE_NOISE_VORONOI_F4) - return voronoi_F4S(p); - if(basis == NODE_NOISE_VORONOI_F2_F1) - return voronoi_F1F2S(p); - if(basis == NODE_NOISE_VORONOI_CRACKLE) - return voronoi_CrS(p); - if(basis == NODE_NOISE_CELL_NOISE) - return cellnoise(p); - - return 0.0f; -#endif -} - -/* Soft/Hard Noise */ - -ccl_device float noise_basis_hard(float3 p, NodeNoiseBasis basis, int hard) -{ - float t = noise_basis(p, basis); - return (hard)? fabsf(2.0f*t - 1.0f): t; -} - /* Turbulence */ -ccl_device_noinline float noise_turbulence(float3 p, NodeNoiseBasis basis, float octaves, int hard) +ccl_device_noinline float noise_turbulence(float3 p, float octaves, int hard) { float fscale = 1.0f; float amp = 1.0f; @@ -281,7 +29,7 @@ ccl_device_noinline float noise_turbulence(float3 p, NodeNoiseBasis basis, float n = float_to_int(octaves); for(i = 0; i <= n; i++) { - float t = noise_basis(fscale*p, basis); + float t = noise(fscale*p); if(hard) t = fabsf(2.0f*t - 1.0f); @@ -294,7 +42,7 @@ ccl_device_noinline float noise_turbulence(float3 p, NodeNoiseBasis basis, float float rmd = octaves - floorf(octaves); if(rmd != 0.0f) { - float t = noise_basis(fscale*p, basis); + float t = noise(fscale*p); if(hard) t = fabsf(2.0f*t - 1.0f); diff --git a/intern/cycles/kernel/svm/svm_types.h b/intern/cycles/kernel/svm/svm_types.h index 7130b14a426..009e91192eb 100644 --- a/intern/cycles/kernel/svm/svm_types.h +++ b/intern/cycles/kernel/svm/svm_types.h @@ -28,6 +28,29 @@ CCL_NAMESPACE_BEGIN /* Nodes */ +/* Known frequencies of used nodes, used for selective nodes compilation + * in the kernel. Currently only affects split OpenCL kernel. + * + * Keep as defines so it's easy to check which nodes are to be compiled + * from preprocessor. 
+ * + * The lower the group number, the more often the node is used. + */ +#define NODE_GROUP_LEVEL_0 0 +#define NODE_GROUP_LEVEL_1 1 +#define NODE_GROUP_LEVEL_2 2 +#define NODE_GROUP_LEVEL_3 3 +#define NODE_GROUP_LEVEL_MAX NODE_GROUP_LEVEL_3 + +#define NODE_FEATURE_VOLUME (1 << 0) +#define NODE_FEATURE_HAIR (1 << 1) +#define NODE_FEATURE_BUMP (1 << 2) +/* TODO(sergey): Consider using something like ((uint)(-1)). + * Need to check carefully operand types around usage of this + * define first. + */ +#define NODE_FEATURE_ALL (NODE_FEATURE_VOLUME|NODE_FEATURE_HAIR|NODE_FEATURE_BUMP) + typedef enum NodeType { NODE_END = 0, NODE_CLOSURE_BSDF, @@ -256,27 +279,6 @@ typedef enum NodeConvert { NODE_CONVERT_IV } NodeConvert; -typedef enum NodeDistanceMetric { - NODE_VORONOI_DISTANCE_SQUARED, - NODE_VORONOI_ACTUAL_DISTANCE, - NODE_VORONOI_MANHATTAN, - NODE_VORONOI_CHEBYCHEV, - NODE_VORONOI_MINKOVSKY_H, - NODE_VORONOI_MINKOVSKY_4, - NODE_VORONOI_MINKOVSKY -} NodeDistanceMetric; - -typedef enum NodeNoiseBasis { - NODE_NOISE_PERLIN, - NODE_NOISE_VORONOI_F1, - NODE_NOISE_VORONOI_F2, - NODE_NOISE_VORONOI_F3, - NODE_NOISE_VORONOI_F4, - NODE_NOISE_VORONOI_F2_F1, - NODE_NOISE_VORONOI_CRACKLE, - NODE_NOISE_CELL_NOISE -} NodeNoiseBasis; - typedef enum NodeMusgraveType { NODE_MUSGRAVE_MULTIFRACTAL, NODE_MUSGRAVE_FBM, @@ -426,6 +428,7 @@ typedef enum ClosureType { #define CLOSURE_IS_BACKGROUND(type) (type == CLOSURE_BACKGROUND_ID) #define CLOSURE_IS_AMBIENT_OCCLUSION(type) (type == CLOSURE_AMBIENT_OCCLUSION_ID) #define CLOSURE_IS_PHASE(type) (type == CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID) +#define CLOSURE_IS_GLASS(type) (type >= CLOSURE_BSDF_MICROFACET_BECKMANN_GLASS_ID && type <= CLOSURE_BSDF_SHARP_GLASS_ID) #define CLOSURE_WEIGHT_CUTOFF 1e-5f diff --git a/intern/cycles/kernel/svm/svm_vector_transform.h b/intern/cycles/kernel/svm/svm_vector_transform.h index a16786f3ed3..4c32130d06d 100644 --- a/intern/cycles/kernel/svm/svm_vector_transform.h +++ b/intern/cycles/kernel/svm/svm_vector_transform.h 
@@ -33,7 +33,7 @@ ccl_device void svm_node_vector_transform(KernelGlobals *kg, ShaderData *sd, flo NodeVectorTransformConvertSpace to = (NodeVectorTransformConvertSpace)ito; Transform tfm; - bool is_object = (sd->object != OBJECT_NONE); + bool is_object = (ccl_fetch(sd, object) != OBJECT_NONE); bool is_direction = (type == NODE_VECTOR_TRANSFORM_TYPE_VECTOR || type == NODE_VECTOR_TRANSFORM_TYPE_NORMAL); /* From world */ @@ -45,7 +45,7 @@ ccl_device void svm_node_vector_transform(KernelGlobals *kg, ShaderData *sd, flo else in = transform_point(&tfm, in); } - else if (to == NODE_VECTOR_TRANSFORM_CONVERT_SPACE_OBJECT && is_object) { + else if(to == NODE_VECTOR_TRANSFORM_CONVERT_SPACE_OBJECT && is_object) { if(is_direction) object_inverse_dir_transform(kg, sd, &in); else @@ -54,7 +54,7 @@ ccl_device void svm_node_vector_transform(KernelGlobals *kg, ShaderData *sd, flo } /* From camera */ - else if (from == NODE_VECTOR_TRANSFORM_CONVERT_SPACE_CAMERA) { + else if(from == NODE_VECTOR_TRANSFORM_CONVERT_SPACE_CAMERA) { if(to == NODE_VECTOR_TRANSFORM_CONVERT_SPACE_WORLD || to == NODE_VECTOR_TRANSFORM_CONVERT_SPACE_OBJECT) { tfm = kernel_data.cam.cameratoworld; if(is_direction) diff --git a/intern/cycles/kernel/svm/svm_voronoi.h b/intern/cycles/kernel/svm/svm_voronoi.h index 5a2e6e97dd3..d612d7e973f 100644 --- a/intern/cycles/kernel/svm/svm_voronoi.h +++ b/intern/cycles/kernel/svm/svm_voronoi.h @@ -18,6 +18,92 @@ CCL_NAMESPACE_BEGIN /* Voronoi */ +ccl_device float voronoi_F1_distance(float3 p) +{ + /* returns squared distance in da */ + float da = 1e10f; + +#ifndef __KERNEL_SSE2__ + int ix = floor_to_int(p.x), iy = floor_to_int(p.y), iz = floor_to_int(p.z); + + for(int xx = -1; xx <= 1; xx++) { + for(int yy = -1; yy <= 1; yy++) { + for(int zz = -1; zz <= 1; zz++) { + float3 ip = make_float3(ix + xx, iy + yy, iz + zz); + float3 vp = ip + cellnoise_color(ip); + float d = len_squared(p - vp); + da = min(d, da); + } + } + } +#else + ssef vec_p = load4f(p); + ssei xyzi = 
quick_floor_sse(vec_p); + + for(int xx = -1; xx <= 1; xx++) { + for(int yy = -1; yy <= 1; yy++) { + for(int zz = -1; zz <= 1; zz++) { + ssef ip = ssef(xyzi + ssei(xx, yy, zz, 0)); + ssef vp = ip + cellnoise_color(ip); + float d = len_squared<1, 1, 1, 0>(vec_p - vp); + da = min(d, da); + } + } + } +#endif + + return da; +} + +ccl_device float3 voronoi_F1_color(float3 p) +{ + /* returns color of the nearest point */ + float da = 1e10f; + +#ifndef __KERNEL_SSE2__ + float3 pa; + int ix = floor_to_int(p.x), iy = floor_to_int(p.y), iz = floor_to_int(p.z); + + for(int xx = -1; xx <= 1; xx++) { + for(int yy = -1; yy <= 1; yy++) { + for(int zz = -1; zz <= 1; zz++) { + float3 ip = make_float3(ix + xx, iy + yy, iz + zz); + float3 vp = ip + cellnoise_color(ip); + float d = len_squared(p - vp); + + if(d < da) { + da = d; + pa = vp; + } + } + } + } + + return cellnoise_color(pa); +#else + ssef pa, vec_p = load4f(p); + ssei xyzi = quick_floor_sse(vec_p); + + for(int xx = -1; xx <= 1; xx++) { + for(int yy = -1; yy <= 1; yy++) { + for(int zz = -1; zz <= 1; zz++) { + ssef ip = ssef(xyzi + ssei(xx, yy, zz, 0)); + ssef vp = ip + cellnoise_color(ip); + float d = len_squared<1, 1, 1, 0>(vec_p - vp); + + if(d < da) { + da = d; + pa = vp; + } + } + } + } + + ssef color = cellnoise_color(pa); + return (float3 &)color; +#endif +} + ccl_device_noinline float4 svm_voronoi(NodeVoronoiColoring coloring, float3 p) { if(coloring == NODE_VORONOI_INTENSITY) { diff --git a/intern/cycles/kernel/svm/svm_wave.h b/intern/cycles/kernel/svm/svm_wave.h index 36b59c3684c..6eaddaf301c 100644 --- a/intern/cycles/kernel/svm/svm_wave.h +++ b/intern/cycles/kernel/svm/svm_wave.h @@ -28,7 +28,7 @@ ccl_device_noinline float svm_wave(NodeWaveType type, float3 p, float detail, fl n = len(p) * 20.0f; if(distortion != 0.0f) - n += distortion * noise_turbulence(p*dscale, NODE_NOISE_PERLIN, detail, 0); + n += distortion * noise_turbulence(p*dscale, detail, 0); return 0.5f + 0.5f * sinf(n); } diff --git 
a/intern/cycles/kernel/svm/svm_wavelength.h b/intern/cycles/kernel/svm/svm_wavelength.h index 9e57c470c0f..57030f3979d 100644 --- a/intern/cycles/kernel/svm/svm_wavelength.h +++ b/intern/cycles/kernel/svm/svm_wavelength.h @@ -77,7 +77,7 @@ ccl_device void svm_node_wavelength(ShaderData *sd, float *stack, uint wavelengt int i = float_to_int(ii); float3 color; - if (i < 0 || i >= 80) { + if(i < 0 || i >= 80) { color = make_float3(0.0f, 0.0f, 0.0f); } else { diff --git a/intern/cycles/kernel/svm/svm_wireframe.h b/intern/cycles/kernel/svm/svm_wireframe.h index 42fe3e8e429..30ccd523add 100644 --- a/intern/cycles/kernel/svm/svm_wireframe.h +++ b/intern/cycles/kernel/svm/svm_wireframe.h @@ -41,9 +41,9 @@ ccl_device float wireframe(KernelGlobals *kg, float3 *P) { #ifdef __HAIR__ - if (sd->prim != PRIM_NONE && sd->type & PRIMITIVE_ALL_TRIANGLE) + if(ccl_fetch(sd, prim) != PRIM_NONE && ccl_fetch(sd, type) & PRIMITIVE_ALL_TRIANGLE) #else - if (sd->prim != PRIM_NONE) + if(ccl_fetch(sd, prim) != PRIM_NONE) #endif { float3 Co[3]; @@ -52,12 +52,12 @@ ccl_device float wireframe(KernelGlobals *kg, /* Triangles */ int np = 3; - if(sd->type & PRIMITIVE_TRIANGLE) - triangle_vertices(kg, sd->prim, Co); + if(ccl_fetch(sd, type) & PRIMITIVE_TRIANGLE) + triangle_vertices(kg, ccl_fetch(sd, prim), Co); else - motion_triangle_vertices(kg, sd->object, sd->prim, sd->time, Co); + motion_triangle_vertices(kg, ccl_fetch(sd, object), ccl_fetch(sd, prim), ccl_fetch(sd, time), Co); - if(!(sd->flag & SD_TRANSFORM_APPLIED)) { + if(!(ccl_fetch(sd, flag) & SD_TRANSFORM_APPLIED)) { object_position_transform(kg, sd, &Co[0]); object_position_transform(kg, sd, &Co[1]); object_position_transform(kg, sd, &Co[2]); @@ -66,8 +66,8 @@ ccl_device float wireframe(KernelGlobals *kg, if(pixel_size) { // Project the derivatives of P to the viewing plane defined // by I so we have a measure of how big is a pixel at this point - float pixelwidth_x = len(sd->dP.dx - dot(sd->dP.dx, sd->I) * sd->I); - float pixelwidth_y = 
len(sd->dP.dy - dot(sd->dP.dy, sd->I) * sd->I); + float pixelwidth_x = len(ccl_fetch(sd, dP).dx - dot(ccl_fetch(sd, dP).dx, ccl_fetch(sd, I)) * ccl_fetch(sd, I)); + float pixelwidth_y = len(ccl_fetch(sd, dP).dy - dot(ccl_fetch(sd, dP).dy, ccl_fetch(sd, I)) * ccl_fetch(sd, I)); // Take the average of both axis' length pixelwidth = (pixelwidth_x + pixelwidth_y) * 0.5f; } @@ -76,7 +76,7 @@ ccl_device float wireframe(KernelGlobals *kg, // other half. And take the square for fast comparison pixelwidth *= 0.5f * size; pixelwidth *= pixelwidth; - for (int i = 0; i < np; i++) { + for(int i = 0; i < np; i++) { int i2 = i ? i - 1 : np - 1; float3 dir = *P - Co[i]; float3 edge = Co[i] - Co[i2]; @@ -84,7 +84,7 @@ ccl_device float wireframe(KernelGlobals *kg, // At this point dot(crs, crs) / dot(edge, edge) is // the square of area / length(edge) == square of the // distance to the edge. - if (dot(crs, crs) < (dot(edge, edge) * pixelwidth)) + if(dot(crs, crs) < (dot(edge, edge) * pixelwidth)) return 1.0f; } } @@ -106,19 +106,30 @@ ccl_device void svm_node_wireframe(KernelGlobals *kg, int pixel_size = (int)use_pixel_size; /* Calculate wireframe */ - float f = wireframe(kg, sd, size, pixel_size, &sd->P); +#ifdef __SPLIT_KERNEL__ + /* TODO(sergey): This is because sd is actually in global space, + * which makes it difficult to re-use the same wireframe() function. + * + * With OpenCL 2.0 it's possible to avoid this change, but until + * then we'll be living with such an exception. + */ + float3 P = ccl_fetch(sd, P); + float f = wireframe(kg, sd, size, pixel_size, &P); +#else + float f = wireframe(kg, sd, size, pixel_size, &ccl_fetch(sd, P)); +#endif /* TODO(sergey): Think of faster way to calculate derivatives. 
*/ if(bump_offset == NODE_BUMP_OFFSET_DX) { - float3 Px = sd->P - sd->dP.dx; - f += (f - wireframe(kg, sd, size, pixel_size, &Px)) / len(sd->dP.dx); + float3 Px = ccl_fetch(sd, P) - ccl_fetch(sd, dP).dx; + f += (f - wireframe(kg, sd, size, pixel_size, &Px)) / len(ccl_fetch(sd, dP).dx); } - else if (bump_offset == NODE_BUMP_OFFSET_DY) { - float3 Py = sd->P - sd->dP.dy; - f += (f - wireframe(kg, sd, size, pixel_size, &Py)) / len(sd->dP.dy); + else if(bump_offset == NODE_BUMP_OFFSET_DY) { + float3 Py = ccl_fetch(sd, P) - ccl_fetch(sd, dP).dy; + f += (f - wireframe(kg, sd, size, pixel_size, &Py)) / len(ccl_fetch(sd, dP).dy); } - if (stack_valid(out_fac)) + if(stack_valid(out_fac)) stack_store_float(stack, out_fac, f); } |