diff options
author | Daniel Genrich <daniel.genrich@gmx.net> | 2014-10-23 17:12:28 +0400 |
---|---|---|
committer | Daniel Genrich <daniel.genrich@gmx.net> | 2014-10-23 17:12:28 +0400 |
commit | 9ff1ebed52e0f858a395eeea4caf89304e068b2d (patch) | |
tree | b05d0f4b229de61b088a128ad412dd7bba347928 /intern/cycles/kernel | |
parent | a2ed11c6eeab5fab8cb81e32e1c68fdafdd5dbbc (diff) | |
parent | eaaeae469968c5c78a5d7e6d202f1af00b382a79 (diff) |
Merge remote-tracking branch 'origin/master' into soc-2014-fluid
Conflicts:
.gitignore
intern/cycles/CMakeLists.txt
source/blender/blenkernel/intern/smoke.c
source/blender/python/intern/bpy_interface.c
source/creator/CMakeLists.txt
Diffstat (limited to 'intern/cycles/kernel')
81 files changed, 4614 insertions, 2506 deletions
diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt index d18f4fa2998..c521e1383a4 100644 --- a/intern/cycles/kernel/CMakeLists.txt +++ b/intern/cycles/kernel/CMakeLists.txt @@ -19,12 +19,13 @@ set(SRC set(SRC_HEADERS kernel.h kernel_accumulate.h + kernel_bake.h kernel_camera.h kernel_compat_cpu.h kernel_compat_cuda.h kernel_compat_opencl.h + kernel_debug.h kernel_differential.h - kernel_displace.h kernel_emission.h kernel_film.h kernel_globals.h @@ -35,6 +36,8 @@ set(SRC_HEADERS kernel_passes.h kernel_path.h kernel_path_state.h + kernel_path_surface.h + kernel_path_volume.h kernel_projection.h kernel_random.h kernel_shader.h @@ -58,8 +61,7 @@ set(SRC_CLOSURE_HEADERS closure/bsdf_toon.h closure/bsdf_transparent.h closure/bsdf_util.h - closure/bsdf_ward.h - closure/bsdf_westin.h + closure/bsdf_ashikhmin_shirley.h closure/bsdf_hair.h closure/bssrdf.h closure/emissive.h @@ -95,8 +97,8 @@ set(SRC_SVM_HEADERS svm/svm_noisetex.h svm/svm_normal.h svm/svm_ramp.h - svm/svm_sepcomb_rgb.h svm/svm_sepcomb_hsv.h + svm/svm_sepcomb_vector.h svm/svm_sky.h svm/svm_tex_coord.h svm/svm_texture.h @@ -111,8 +113,10 @@ set(SRC_GEOM_HEADERS geom/geom.h geom/geom_attribute.h geom/geom_bvh.h + geom/geom_bvh_shadow.h geom/geom_bvh_subsurface.h geom/geom_bvh_traversal.h + geom/geom_bvh_volume.h geom/geom_curve.h geom/geom_motion_curve.h geom/geom_motion_triangle.h @@ -146,50 +150,69 @@ if(WITH_CYCLES_CUDA_BINARIES) set(CUDA_VERSION "${CUDA_VERSION_MAJOR}${CUDA_VERSION_MINOR}") # warn for other versions - if(CUDA_VERSION MATCHES "60") + if(CUDA_VERSION MATCHES "65") else() message(WARNING "CUDA version ${CUDA_VERSION_MAJOR}.${CUDA_VERSION_MINOR} detected, " - "build may succeed but only CUDA 6.0 is officially supported") + "build may succeed but only CUDA 6.5 is officially supported") endif() # build for each arch set(cuda_sources kernel.cu ${SRC_HEADERS} ${SRC_SVM_HEADERS} ${SRC_GEOM_HEADERS} ${SRC_CLOSURE_HEADERS} ${SRC_UTIL_HEADERS}) set(cuda_cubins) - 
foreach(arch ${CYCLES_CUDA_BINARIES_ARCH}) - set(cuda_cubin kernel_${arch}.cubin) + macro(CYCLES_CUDA_KERNEL_ADD arch experimental) + if(${experimental}) + set(cuda_extra_flags "-D__KERNEL_CUDA_EXPERIMENTAL__") + set(cuda_cubin kernel_experimental_${arch}.cubin) + else() + set(cuda_extra_flags "") + set(cuda_cubin kernel_${arch}.cubin) + endif() + + if(WITH_CYCLES_DEBUG) + set(cuda_debug_flags "-D__KERNEL_DEBUG__") + else() + set(cuda_debug_flags "") + endif() set(cuda_version_flags "-D__KERNEL_CUDA_VERSION__=${CUDA_VERSION}") set(cuda_math_flags "--use_fast_math") - if(CUDA_VERSION LESS 60 AND ${arch} MATCHES "sm_50") - message(WARNING "Can't build kernel for CUDA sm_50 architecture, skipping") - elseif(CUDA_VERSION LESS 50 AND ${arch} MATCHES "sm_35") - message(WARNING "Can't build kernel for CUDA sm_35 architecture, skipping") - else() - add_custom_command( - OUTPUT ${cuda_cubin} - COMMAND ${CUDA_NVCC_EXECUTABLE} - -arch=${arch} - -m${CUDA_BITS} - --cubin ${CMAKE_CURRENT_SOURCE_DIR}/kernel.cu - -o ${CMAKE_CURRENT_BINARY_DIR}/${cuda_cubin} - --ptxas-options="-v" - ${cuda_arch_flags} - ${cuda_version_flags} - ${cuda_math_flags} - -I${CMAKE_CURRENT_SOURCE_DIR}/../util - -I${CMAKE_CURRENT_SOURCE_DIR}/svm - -DCCL_NAMESPACE_BEGIN= - -DCCL_NAMESPACE_END= - -DNVCC - - DEPENDS ${cuda_sources}) - - delayed_install("${CMAKE_CURRENT_BINARY_DIR}" "${cuda_cubin}" ${CYCLES_INSTALL_PATH}/lib) - list(APPEND cuda_cubins ${cuda_cubin}) - endif() + add_custom_command( + OUTPUT ${cuda_cubin} + COMMAND ${CUDA_NVCC_EXECUTABLE} + -arch=${arch} + -m${CUDA_BITS} + --cubin ${CMAKE_CURRENT_SOURCE_DIR}/kernel.cu + -o ${CMAKE_CURRENT_BINARY_DIR}/${cuda_cubin} + --ptxas-options="-v" + ${cuda_arch_flags} + ${cuda_version_flags} + ${cuda_math_flags} + ${cuda_extra_flags} + ${cuda_debug_flags} + -I${CMAKE_CURRENT_SOURCE_DIR}/../util + -I${CMAKE_CURRENT_SOURCE_DIR}/svm + -DCCL_NAMESPACE_BEGIN= + -DCCL_NAMESPACE_END= + -DNVCC + + DEPENDS ${cuda_sources}) + + 
delayed_install("${CMAKE_CURRENT_BINARY_DIR}" "${cuda_cubin}" ${CYCLES_INSTALL_PATH}/lib) + list(APPEND cuda_cubins ${cuda_cubin}) + + unset(cuda_extra_flags) + unset(cuda_debug_flags) + endmacro() + + foreach(arch ${CYCLES_CUDA_BINARIES_ARCH}) + # Compile regular kernel + CYCLES_CUDA_KERNEL_ADD(${arch} FALSE) + + # Compile experimental kernel + CYCLES_CUDA_KERNEL_ADD(${arch} TRUE) endforeach() add_custom_target(cycles_kernel_cuda ALL DEPENDS ${cuda_cubins}) @@ -213,12 +236,14 @@ if(CXX_HAS_SSE) kernel_sse3.cpp kernel_sse41.cpp kernel_avx.cpp + kernel_avx2.cpp ) set_source_files_properties(kernel_sse2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS}") set_source_files_properties(kernel_sse3.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE3_KERNEL_FLAGS}") set_source_files_properties(kernel_sse41.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE41_KERNEL_FLAGS}") set_source_files_properties(kernel_avx.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX_KERNEL_FLAGS}") + set_source_files_properties(kernel_avx2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX2_KERNEL_FLAGS}") endif() diff --git a/intern/cycles/kernel/SConscript b/intern/cycles/kernel/SConscript index 04e1bad7538..c0d969e24ae 100644 --- a/intern/cycles/kernel/SConscript +++ b/intern/cycles/kernel/SConscript @@ -30,6 +30,7 @@ import subprocess import sys import os import Blender as B +import btools def normpath(path): return os.path.abspath(os.path.normpath(path)) @@ -64,49 +65,56 @@ if env['WITH_BF_CYCLES_CUDA_BINARIES']: closure_dir = os.path.join(source_dir, "../closure") # get CUDA version - nvcc_pipe = subprocess.Popen([nvcc, "--version"],stdout=subprocess.PIPE,stderr=subprocess.PIPE) - output, erroroutput = nvcc_pipe.communicate() + output = btools.get_command_output([nvcc, "--version"]) cuda_major_minor = re.findall(r'release (\d+).(\d+)', output)[0] cuda_version = int(cuda_major_minor[0])*10 + int(cuda_major_minor[1]) - if cuda_version != 60: - print("CUDA version %d.%d detected, build may succeed but only CUDA 
6.0 is officially supported." % (cuda_version/10, cuda_version%10)) + if cuda_version != 65: + print("CUDA version %d.%d detected, build may succeed but only CUDA 6.5 is officially supported." % (cuda_version/10, cuda_version%10)) # nvcc flags nvcc_flags = "-m%s" % (bits) - nvcc_flags += " --cubin --ptxas-options=\"-v\"" + nvcc_flags += " --cubin --ptxas-options=\"-v\" --use_fast_math" nvcc_flags += " -D__KERNEL_CUDA_VERSION__=%d" % (cuda_version) nvcc_flags += " -DCCL_NAMESPACE_BEGIN= -DCCL_NAMESPACE_END= -DNVCC" nvcc_flags += " -I \"%s\" -I \"%s\" -I \"%s\" -I \"%s\"" % (util_dir, svm_dir, geom_dir, closure_dir) + if env['WITH_BF_CYCLES_DEBUG']: + nvcc_flags += " -D__KERNEL_DEBUG__" + # dependencies dependencies = ['kernel.cu'] + kernel.Glob('*.h') + kernel.Glob('../util/*.h') + kernel.Glob('svm/*.h') + kernel.Glob('geom/*.h') + kernel.Glob('closure/*.h') last_cubin_file = None + configs = (("kernel_%s.cubin", ''), + ("kernel_experimental_%s.cubin", ' -D__KERNEL_CUDA_EXPERIMENTAL__')) + # add command for each cuda architecture for arch in cuda_archs: - if cuda_version < 60 and arch == "sm_50": - print("Can't build kernel for CUDA sm_50 architecture, skipping") - continue - - cubin_file = os.path.join(build_dir, "kernel_%s.cubin" % arch) - - if env['BF_CYCLES_CUDA_ENV']: - MS_SDK = "C:\\Program Files\\Microsoft SDKs\\Windows\\v7.1\\Bin\\SetEnv.cmd" - command = "\"%s\" & \"%s\" -arch=%s %s \"%s\" -o \"%s\"" % (MS_SDK, nvcc, arch, nvcc_flags, kernel_file, cubin_file) - else: - command = "\"%s\" -arch=%s %s \"%s\" -o \"%s\"" % (nvcc, arch, nvcc_flags, kernel_file, cubin_file) - - kernel.Command(cubin_file, 'kernel.cu', command) - kernel.Depends(cubin_file, dependencies) - - kernel_binaries.append(cubin_file) - - if not env['WITH_BF_CYCLES_CUDA_THREADED_COMPILE']: - # trick to compile one kernel at a time to reduce memory usage - if last_cubin_file: - kernel.Depends(cubin_file, last_cubin_file) - last_cubin_file = cubin_file + for config in configs: + # TODO(sergey): 
Use dict instead ocouple in order to increase readability? + name = config[0] + extra_flags = config[1] + + cubin_file = os.path.join(build_dir, name % arch) + current_flags = nvcc_flags + extra_flags + + if env['BF_CYCLES_CUDA_ENV']: + MS_SDK = "C:\\Program Files\\Microsoft SDKs\\Windows\\v7.1\\Bin\\SetEnv.cmd" + command = "\"%s\" & \"%s\" -arch=%s %s \"%s\" -o \"%s\"" % (MS_SDK, nvcc, arch, current_flags, kernel_file, cubin_file) + else: + command = "\"%s\" -arch=%s %s \"%s\" -o \"%s\"" % (nvcc, arch, current_flags, kernel_file, cubin_file) + + kernel.Command(cubin_file, 'kernel.cu', command) + kernel.Depends(cubin_file, dependencies) + + kernel_binaries.append(cubin_file) + + if not env['WITH_BF_CYCLES_CUDA_THREADED_COMPILE']: + # trick to compile one kernel at a time to reduce memory usage + if last_cubin_file: + kernel.Depends(cubin_file, last_cubin_file) + last_cubin_file = cubin_file Return('kernel_binaries') diff --git a/intern/cycles/kernel/closure/bsdf.h b/intern/cycles/kernel/closure/bsdf.h index 24b54cd9d9e..7d4783b0f3c 100644 --- a/intern/cycles/kernel/closure/bsdf.h +++ b/intern/cycles/kernel/closure/bsdf.h @@ -23,10 +23,7 @@ #include "../closure/bsdf_reflection.h" #include "../closure/bsdf_refraction.h" #include "../closure/bsdf_transparent.h" -#ifdef __ANISOTROPIC__ -#include "../closure/bsdf_ward.h" -#endif -#include "../closure/bsdf_westin.h" +#include "../closure/bsdf_ashikhmin_shirley.h" #include "../closure/bsdf_toon.h" #include "../closure/bsdf_hair.h" #ifdef __SUBSURFACE__ @@ -83,21 +80,22 @@ ccl_device int bsdf_sample(KernelGlobals *kg, const ShaderData *sd, const Shader eval, omega_in, &domega_in->dx, &domega_in->dy, pdf); break; case CLOSURE_BSDF_MICROFACET_GGX_ID: + case CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID: case CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID: - label = bsdf_microfacet_ggx_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, + label = bsdf_microfacet_ggx_sample(kg, sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, 
randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf); break; case CLOSURE_BSDF_MICROFACET_BECKMANN_ID: + case CLOSURE_BSDF_MICROFACET_BECKMANN_ANISO_ID: case CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID: - label = bsdf_microfacet_beckmann_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, + label = bsdf_microfacet_beckmann_sample(kg, sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf); break; -#ifdef __ANISOTROPIC__ - case CLOSURE_BSDF_WARD_ID: - label = bsdf_ward_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, + case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ID: + case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ANISO_ID: + label = bsdf_ashikhmin_shirley_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf); break; -#endif case CLOSURE_BSDF_ASHIKHMIN_VELVET_ID: label = bsdf_ashikhmin_velvet_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf); @@ -110,14 +108,6 @@ ccl_device int bsdf_sample(KernelGlobals *kg, const ShaderData *sd, const Shader label = bsdf_glossy_toon_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf); break; - case CLOSURE_BSDF_WESTIN_BACKSCATTER_ID: - label = bsdf_westin_backscatter_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, - eval, omega_in, &domega_in->dx, &domega_in->dy, pdf); - break; - case CLOSURE_BSDF_WESTIN_SHEEN_ID: - label = bsdf_westin_sheen_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, - eval, omega_in, &domega_in->dx, &domega_in->dy, pdf); - break; case CLOSURE_BSDF_HAIR_REFLECTION_ID: label = bsdf_hair_reflection_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf); @@ -178,18 +168,19 @@ ccl_device float3 bsdf_eval(KernelGlobals *kg, const ShaderData *sd, const Shade eval = 
bsdf_transparent_eval_reflect(sc, sd->I, omega_in, pdf); break; case CLOSURE_BSDF_MICROFACET_GGX_ID: + case CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID: case CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID: eval = bsdf_microfacet_ggx_eval_reflect(sc, sd->I, omega_in, pdf); break; case CLOSURE_BSDF_MICROFACET_BECKMANN_ID: + case CLOSURE_BSDF_MICROFACET_BECKMANN_ANISO_ID: case CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID: eval = bsdf_microfacet_beckmann_eval_reflect(sc, sd->I, omega_in, pdf); break; -#ifdef __ANISOTROPIC__ - case CLOSURE_BSDF_WARD_ID: - eval = bsdf_ward_eval_reflect(sc, sd->I, omega_in, pdf); + case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ID: + case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ANISO_ID: + eval = bsdf_ashikhmin_shirley_eval_reflect(sc, sd->I, omega_in, pdf); break; -#endif case CLOSURE_BSDF_ASHIKHMIN_VELVET_ID: eval = bsdf_ashikhmin_velvet_eval_reflect(sc, sd->I, omega_in, pdf); break; @@ -199,12 +190,6 @@ ccl_device float3 bsdf_eval(KernelGlobals *kg, const ShaderData *sd, const Shade case CLOSURE_BSDF_GLOSSY_TOON_ID: eval = bsdf_glossy_toon_eval_reflect(sc, sd->I, omega_in, pdf); break; - case CLOSURE_BSDF_WESTIN_BACKSCATTER_ID: - eval = bsdf_westin_backscatter_eval_reflect(sc, sd->I, omega_in, pdf); - break; - case CLOSURE_BSDF_WESTIN_SHEEN_ID: - eval = bsdf_westin_sheen_eval_reflect(sc, sd->I, omega_in, pdf); - break; case CLOSURE_BSDF_HAIR_REFLECTION_ID: eval = bsdf_hair_reflection_eval_reflect(sc, sd->I, omega_in, pdf); break; @@ -245,18 +230,19 @@ ccl_device float3 bsdf_eval(KernelGlobals *kg, const ShaderData *sd, const Shade eval = bsdf_transparent_eval_transmit(sc, sd->I, omega_in, pdf); break; case CLOSURE_BSDF_MICROFACET_GGX_ID: + case CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID: case CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID: eval = bsdf_microfacet_ggx_eval_transmit(sc, sd->I, omega_in, pdf); break; case CLOSURE_BSDF_MICROFACET_BECKMANN_ID: + case CLOSURE_BSDF_MICROFACET_BECKMANN_ANISO_ID: case CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID: eval = 
bsdf_microfacet_beckmann_eval_transmit(sc, sd->I, omega_in, pdf); break; -#ifdef __ANISOTROPIC__ - case CLOSURE_BSDF_WARD_ID: - eval = bsdf_ward_eval_transmit(sc, sd->I, omega_in, pdf); + case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ID: + case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ANISO_ID: + eval = bsdf_ashikhmin_shirley_eval_transmit(sc, sd->I, omega_in, pdf); break; -#endif case CLOSURE_BSDF_ASHIKHMIN_VELVET_ID: eval = bsdf_ashikhmin_velvet_eval_transmit(sc, sd->I, omega_in, pdf); break; @@ -266,12 +252,6 @@ ccl_device float3 bsdf_eval(KernelGlobals *kg, const ShaderData *sd, const Shade case CLOSURE_BSDF_GLOSSY_TOON_ID: eval = bsdf_glossy_toon_eval_transmit(sc, sd->I, omega_in, pdf); break; - case CLOSURE_BSDF_WESTIN_BACKSCATTER_ID: - eval = bsdf_westin_backscatter_eval_transmit(sc, sd->I, omega_in, pdf); - break; - case CLOSURE_BSDF_WESTIN_SHEEN_ID: - eval = bsdf_westin_sheen_eval_transmit(sc, sd->I, omega_in, pdf); - break; case CLOSURE_BSDF_HAIR_REFLECTION_ID: eval = bsdf_hair_reflection_eval_transmit(sc, sd->I, omega_in, pdf); break; @@ -330,18 +310,19 @@ ccl_device void bsdf_blur(KernelGlobals *kg, ShaderClosure *sc, float roughness) bsdf_transparent_blur(sc, roughness); break; case CLOSURE_BSDF_MICROFACET_GGX_ID: + case CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID: case CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID: bsdf_microfacet_ggx_blur(sc, roughness); break; case CLOSURE_BSDF_MICROFACET_BECKMANN_ID: + case CLOSURE_BSDF_MICROFACET_BECKMANN_ANISO_ID: case CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID: bsdf_microfacet_beckmann_blur(sc, roughness); break; -#ifdef __ANISOTROPIC__ - case CLOSURE_BSDF_WARD_ID: - bsdf_ward_blur(sc, roughness); + case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ID: + case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ANISO_ID: + bsdf_ashikhmin_shirley_blur(sc, roughness); break; -#endif case CLOSURE_BSDF_ASHIKHMIN_VELVET_ID: bsdf_ashikhmin_velvet_blur(sc, roughness); break; @@ -351,12 +332,6 @@ ccl_device void bsdf_blur(KernelGlobals *kg, ShaderClosure *sc, float roughness) case 
CLOSURE_BSDF_GLOSSY_TOON_ID: bsdf_glossy_toon_blur(sc, roughness); break; - case CLOSURE_BSDF_WESTIN_BACKSCATTER_ID: - bsdf_westin_backscatter_blur(sc, roughness); - break; - case CLOSURE_BSDF_WESTIN_SHEEN_ID: - bsdf_westin_sheen_blur(sc, roughness); - break; case CLOSURE_BSDF_HAIR_REFLECTION_ID: case CLOSURE_BSDF_HAIR_TRANSMISSION_ID: bsdf_hair_reflection_blur(sc, roughness); diff --git a/intern/cycles/kernel/closure/bsdf_ashikhmin_shirley.h b/intern/cycles/kernel/closure/bsdf_ashikhmin_shirley.h new file mode 100644 index 00000000000..ad7864cb8ea --- /dev/null +++ b/intern/cycles/kernel/closure/bsdf_ashikhmin_shirley.h @@ -0,0 +1,210 @@ +/* + * Copyright 2011-2013 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License + */ + +#ifndef __BSDF_ASHIKHMIN_SHIRLEY_H__ +#define __BSDF_ASHIKHMIN_SHIRLEY_H__ + +/* +ASHIKHMIN SHIRLEY BSDF + +Implementation of +Michael Ashikhmin and Peter Shirley: "An Anisotropic Phong BRDF Model" (2000) + +The Fresnel factor is missing to get a separable bsdf (intensity*color), as is +the case with all other microfacet-based BSDF implementations in Cycles. + +Other than that, the implementation directly follows the paper. +*/ + +CCL_NAMESPACE_BEGIN + +ccl_device int bsdf_ashikhmin_shirley_setup(ShaderClosure *sc) +{ + /* store roughness. could already convert to exponent to save some cycles + * in eval, but this is more consistent with other bsdfs and shader_blur. 
*/ + sc->data0 = clamp(sc->data0, 1e-4f, 1.0f); + sc->data1 = sc->data0; + + sc->type = CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ID; + return SD_BSDF | SD_BSDF_HAS_EVAL | SD_BSDF_GLOSSY; +} + +ccl_device int bsdf_ashikhmin_shirley_aniso_setup(ShaderClosure *sc) +{ + /* store roughness. could already convert to exponent to save some cycles + * in eval, but this is more consistent with other bsdfs and shader_blur. */ + sc->data0 = clamp(sc->data0, 1e-4f, 1.0f); + sc->data1 = clamp(sc->data1, 1e-4f, 1.0f); + + sc->type = CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ANISO_ID; + return SD_BSDF | SD_BSDF_HAS_EVAL | SD_BSDF_GLOSSY; +} + +ccl_device void bsdf_ashikhmin_shirley_blur(ShaderClosure *sc, float roughness) +{ + sc->data0 = fmaxf(roughness, sc->data0); /* clamp roughness */ + sc->data1 = fmaxf(roughness, sc->data1); +} + +ccl_device_inline float bsdf_ashikhmin_shirley_roughness_to_exponent(float roughness) +{ + return 2.0f / (roughness*roughness) - 2.0f; +} + +ccl_device float3 bsdf_ashikhmin_shirley_eval_reflect(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf) +{ + float3 N = sc->N; + + float NdotI = dot(N, I); /* in Cycles/OSL convention I is omega_out */ + float NdotO = dot(N, omega_in); /* and consequently we use for O omaga_in ;) */ + + float out = 0.0f; + + if (NdotI > 0.0f && NdotO > 0.0f) { + NdotI = fmaxf(NdotI, 1e-6f); + NdotO = fmaxf(NdotO, 1e-6f); + float3 H = normalize(omega_in + I); + float HdotI = fmaxf(fabsf(dot(H, I)), 1e-6f); + float HdotN = fmaxf(dot(H, N), 1e-6f); + + float pump = 1.0f / fmaxf(1e-6f, (HdotI*fmaxf(NdotO, NdotI))); /* pump from original paper (first derivative disc., but cancels the HdotI in the pdf nicely) */ + /*float pump = 1.0f / fmaxf(1e-4f, ((NdotO + NdotI) * (NdotO*NdotI))); */ /* pump from d-brdf paper */ + + float n_x = bsdf_ashikhmin_shirley_roughness_to_exponent(sc->data0); + float n_y = bsdf_ashikhmin_shirley_roughness_to_exponent(sc->data1); + + if (n_x == n_y) { /* => isotropic case */ + float e = n_x; + float 
lobe = powf(HdotN, e); + float norm = (n_x + 1.0f) / (8.0f * M_PI_F); + + out = NdotO * norm * lobe * pump; + *pdf = norm * lobe / HdotI; /* this is p_h / 4(H.I) (conversion from 'wh measure' to 'wi measure', eq. 8 in paper) */ + } + else { /* => ANisotropic case */ + float3 X, Y; + make_orthonormals_tangent(N, sc->T, &X, &Y); + + float HdotX = dot(H, X); + float HdotY = dot(H, Y); + float e = (n_x * HdotX*HdotX + n_y * HdotY*HdotY) / (1.0f - HdotN*HdotN); + float lobe = powf(HdotN, e); + float norm = sqrtf((n_x + 1.0f)*(n_y + 1.0f)) / (8.0f * M_PI_F); + + out = NdotO * norm * lobe * pump; + *pdf = norm * lobe / HdotI; + } + } + + return make_float3(out, out, out); +} + +ccl_device float3 bsdf_ashikhmin_shirley_eval_transmit(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf) +{ + return make_float3(0.0f, 0.0f, 0.0f); +} + +ccl_device_inline void bsdf_ashikhmin_shirley_sample_first_quadrant(float n_x, float n_y, float randu, float randv, float *phi, float *cos_theta) +{ + *phi = atanf(sqrtf((n_x + 1.0f) / (n_y + 1.0f)) * tanf(M_PI_2_F * randu)); + float cos_phi = cosf(*phi); + float sin_phi = sinf(*phi); + *cos_theta = powf(randv, 1.0f / (n_x * cos_phi*cos_phi + n_y * sin_phi*sin_phi + 1.0f)); +} + +ccl_device int bsdf_ashikhmin_shirley_sample(const ShaderClosure *sc, float3 Ng, float3 I, float3 dIdx, float3 dIdy, float randu, float randv, float3 *eval, float3 *omega_in, float3 *domega_in_dx, float3 *domega_in_dy, float *pdf) +{ + float3 N = sc->N; + + float NdotI = dot(N, I); + if (NdotI > 0.0f) { + + float n_x = bsdf_ashikhmin_shirley_roughness_to_exponent(sc->data0); + float n_y = bsdf_ashikhmin_shirley_roughness_to_exponent(sc->data1); + + /* get x,y basis on the surface for anisotropy */ + float3 X, Y; + + if(n_x == n_y) + make_orthonormals(N, &X, &Y); + else + make_orthonormals_tangent(N, sc->T, &X, &Y); + + /* sample spherical coords for h in tangent space */ + float phi; + float cos_theta; + if (n_x == n_y) { /* => simple isotropic 
sampling */ + phi = M_2PI_F * randu; + cos_theta = powf(randv, 1.0f / (n_x + 1.0f)); + } + else { /* => more complex anisotropic sampling */ + if (randu < 0.25f) { /* first quadrant */ + float remapped_randu = 4.0f * randu; + bsdf_ashikhmin_shirley_sample_first_quadrant(n_x, n_y, remapped_randu, randv, &phi, &cos_theta); + } + else if (randu < 0.5f) { /* second quadrant */ + float remapped_randu = 4.0f * (.5f - randu); + bsdf_ashikhmin_shirley_sample_first_quadrant(n_x, n_y, remapped_randu, randv, &phi, &cos_theta); + phi = M_PI_F - phi; + } + else if (randu < 0.75f) { /* third quadrant */ + float remapped_randu = 4.0f * (randu - 0.5f); + bsdf_ashikhmin_shirley_sample_first_quadrant(n_x, n_y, remapped_randu, randv, &phi, &cos_theta); + phi = M_PI_F + phi; + } + else { /* fourth quadrant */ + float remapped_randu = 4.0f * (1.0f - randu); + bsdf_ashikhmin_shirley_sample_first_quadrant(n_x, n_y, remapped_randu, randv, &phi, &cos_theta); + phi = 2.0f * M_PI_F - phi; + } + } + + /* get half vector in tangent space */ + float sin_theta = sqrtf(fmaxf(0.0f, 1.0f - cos_theta*cos_theta)); + float cos_phi = cosf(phi); + float sin_phi = sinf(phi); /* no sqrt(1-cos^2) here b/c it causes artifacts */ + float3 h = make_float3( + sin_theta * cos_phi, + sin_theta * sin_phi, + cos_theta + ); + + /* half vector to world space */ + float3 H = h.x*X + h.y*Y + h.z*N; + float HdotI = dot(H, I); + if (HdotI < 0.0f) H = -H; + + /* reflect I on H to get omega_in */ + *omega_in = -I + (2.0f * HdotI) * H; + + /* leave the rest to eval_reflect */ + /* (could maybe optimize a few things by manual inlining, but I doubt it would make much difference) */ + *eval = bsdf_ashikhmin_shirley_eval_reflect(sc, I, *omega_in, pdf); + +#ifdef __RAY_DIFFERENTIALS__ + /* just do the reflection thing for now */ + *domega_in_dx = (2.0f * dot(N, dIdx)) * N - dIdx; + *domega_in_dy = (2.0f * dot(N, dIdy)) * N - dIdy; +#endif + } + + return LABEL_REFLECT | LABEL_GLOSSY; +} + + +CCL_NAMESPACE_END + +#endif /* 
__BSDF_ASHIKHMIN_SHIRLEY_H__ */ diff --git a/intern/cycles/kernel/closure/bsdf_hair.h b/intern/cycles/kernel/closure/bsdf_hair.h index 19cdb773255..e0b5454592b 100644 --- a/intern/cycles/kernel/closure/bsdf_hair.h +++ b/intern/cycles/kernel/closure/bsdf_hair.h @@ -63,7 +63,7 @@ ccl_device int bsdf_hair_transmission_setup(ShaderClosure *sc) ccl_device float3 bsdf_hair_reflection_eval_reflect(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf) { #ifdef __HAIR__ - float offset = sc->offset; + float offset = sc->data2; float3 Tg = sc->T; #else float offset = 0.0f; @@ -120,7 +120,7 @@ ccl_device float3 bsdf_hair_reflection_eval_transmit(const ShaderClosure *sc, co ccl_device float3 bsdf_hair_transmission_eval_transmit(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf) { #ifdef __HAIR__ - float offset = sc->offset; + float offset = sc->data2; float3 Tg = sc->T; #else float offset = 0.0f; @@ -166,7 +166,7 @@ ccl_device float3 bsdf_hair_transmission_eval_transmit(const ShaderClosure *sc, ccl_device int bsdf_hair_reflection_sample(const ShaderClosure *sc, float3 Ng, float3 I, float3 dIdx, float3 dIdy, float randu, float randv, float3 *eval, float3 *omega_in, float3 *domega_in_dx, float3 *domega_in_dy, float *pdf) { #ifdef __HAIR__ - float offset = sc->offset; + float offset = sc->data2; float3 Tg = sc->T; #else float offset = 0.0f; @@ -221,7 +221,7 @@ ccl_device int bsdf_hair_reflection_sample(const ShaderClosure *sc, float3 Ng, f ccl_device int bsdf_hair_transmission_sample(const ShaderClosure *sc, float3 Ng, float3 I, float3 dIdx, float3 dIdy, float randu, float randv, float3 *eval, float3 *omega_in, float3 *domega_in_dx, float3 *domega_in_dy, float *pdf) { #ifdef __HAIR__ - float offset = sc->offset; + float offset = sc->data2; float3 Tg = sc->T; #else float offset = 0.0f; diff --git a/intern/cycles/kernel/closure/bsdf_microfacet.h b/intern/cycles/kernel/closure/bsdf_microfacet.h index 1ec35e444fe..8737b0e2d94 100644 --- 
a/intern/cycles/kernel/closure/bsdf_microfacet.h +++ b/intern/cycles/kernel/closure/bsdf_microfacet.h @@ -35,20 +35,293 @@ CCL_NAMESPACE_BEGIN -/* GGX */ +/* Approximate erf and erfinv implementations. + * Implementation comes straight from Wikipedia: + * + * http://en.wikipedia.org/wiki/Error_function + * + * Some constants are baked into the code. + */ + +ccl_device_inline float approx_erff_do(float x) +{ + /* Such a clamp doesn't give much distortion to the output value + * and gives quite a few of the speedup. + */ + if(x > 3.0f) { + return 1.0f; + } + float t = 1.0f / (1.0f + 0.47047f*x); + return (1.0f - + t*(0.3480242f + t*(-0.0958798f + t*0.7478556f)) * expf(-x*x)); +} + +ccl_device_inline float approx_erff(float x) +{ + if(x >= 0.0f) { + return approx_erff_do(x); + } + else { + return -approx_erff_do(-x); + } +} + +ccl_device_inline float approx_erfinvf_do(float x) +{ + if(x <= 0.7f) { + const float x2 = x * x; + const float a1 = 0.886226899f; + const float a2 = -1.645349621f; + const float a3 = 0.914624893f; + const float a4 = -0.140543331f; + const float b1 = -2.118377725f; + const float b2 = 1.442710462f; + const float b3 = -0.329097515f; + const float b4 = 0.012229801f; + return x * (((a4 * x2 + a3) * x2 + a2) * x2 + a1) / + ((((b4 * x2 + b3) * x2 + b2) * x2 + b1) * x2 + 1.0f); + } + else { + const float c1 = -1.970840454f; + const float c2 = -1.624906493f; + const float c3 = 3.429567803f; + const float c4 = 1.641345311f; + const float d1 = 3.543889200f; + const float d2 = 1.637067800f; + const float z = sqrtf(-logf((1.0f - x) * 0.5f)); + return (((c4 * z + c3) * z + c2) * z + c1) / + ((d2 * z + d1) * z + 1.0f); + } +} + +ccl_device_inline float approx_erfinvf(float x) +{ + if(x >= 0.0f) { + return approx_erfinvf_do(x); + } + else { + return -approx_erfinvf_do(-x); + } +} + +/* Beckmann and GGX microfacet importance sampling from: + * + * Importance Sampling Microfacet-Based BSDFs using the Distribution of Visible Normals. + * E. Heitz and E. 
d'Eon, EGSR 2014 */ + +ccl_device_inline void microfacet_beckmann_sample_slopes( + KernelGlobals *kg, + const float cos_theta_i, const float sin_theta_i, + float randu, float randv, float *slope_x, float *slope_y, + float *G1i) +{ + /* special case (normal incidence) */ + if(cos_theta_i >= 0.99999f) { + const float r = sqrtf(-logf(randu)); + const float phi = M_2PI_F * randv; + *slope_x = r * cosf(phi); + *slope_y = r * sinf(phi); + *G1i = 1.0f; + return; + } + + /* precomputations */ + const float tan_theta_i = sin_theta_i/cos_theta_i; + const float inv_a = tan_theta_i; + const float a = 1.0f/inv_a; + const float erf_a = approx_erff(a); + const float exp_a2 = expf(-a*a); + const float SQRT_PI_INV = 0.56418958354f; + const float Lambda = 0.5f*(erf_a - 1.0f) + (0.5f*SQRT_PI_INV)*(exp_a2*inv_a); + const float G1 = 1.0f/(1.0f + Lambda); /* masking */ + + *G1i = G1; + +#if 0 + const float C = 1.0f - G1 * erf_a; + + /* sample slope X */ + if(randu < C) { + /* rescale randu */ + randu = randu / C; + const float w_1 = 0.5f * SQRT_PI_INV * sin_theta_i * exp_a2; + const float w_2 = cos_theta_i * (0.5f - 0.5f*erf_a); + const float p = w_1 / (w_1 + w_2); + + if(randu < p) { + randu = randu / p; + *slope_x = -sqrtf(-logf(randu*exp_a2)); + } + else { + randu = (randu - p) / (1.0f - p); + *slope_x = approx_erfinvf(randu - 1.0f - randu*erf_a); + } + } + else { + /* rescale randu */ + randu = (randu - C) / (1.0f - C); + *slope_x = approx_erfinvf((-1.0f + 2.0f*randu)*erf_a); + + const float p = (-(*slope_x)*sin_theta_i + cos_theta_i) / (2.0f*cos_theta_i); + + if(randv > p) { + *slope_x = -(*slope_x); + randv = (randv - p) / (1.0f - p); + } + else + randv = randv / p; + } + + /* sample slope Y */ + *slope_y = approx_erfinvf(2.0f*randv - 1.0f); +#else + /* use precomputed table, because it better preserves stratification + * of the random number pattern */ + int beckmann_table_offset = kernel_data.tables.beckmann_offset; + + *slope_x = lookup_table_read_2D(kg, randu, cos_theta_i, + 
beckmann_table_offset, BECKMANN_TABLE_SIZE, BECKMANN_TABLE_SIZE); + *slope_y = approx_erfinvf(2.0f*randv - 1.0f); +#endif + +} + +ccl_device_inline void microfacet_ggx_sample_slopes( + const float cos_theta_i, const float sin_theta_i, + float randu, float randv, float *slope_x, float *slope_y, + float *G1i) +{ + /* special case (normal incidence) */ + if(cos_theta_i >= 0.99999f) { + const float r = sqrtf(randu/(1.0f - randu)); + const float phi = M_2PI_F * randv; + *slope_x = r * cosf(phi); + *slope_y = r * sinf(phi); + *G1i = 1.0f; + + return; + } + + /* precomputations */ + const float tan_theta_i = sin_theta_i/cos_theta_i; + const float G1_inv = 0.5f * (1.0f + safe_sqrtf(1.0f + tan_theta_i*tan_theta_i)); + + *G1i = 1.0f/G1_inv; + + /* sample slope_x */ + const float A = 2.0f*randu*G1_inv - 1.0f; + const float AA = A*A; + const float tmp = 1.0f/(AA - 1.0f); + const float B = tan_theta_i; + const float BB = B*B; + const float D = safe_sqrtf(BB*(tmp*tmp) - (AA - BB)*tmp); + const float slope_x_1 = B*tmp - D; + const float slope_x_2 = B*tmp + D; + *slope_x = (A < 0.0f || slope_x_2*tan_theta_i > 1.0f)? slope_x_1: slope_x_2; + + /* sample slope_y */ + float S; + + if(randv > 0.5f) { + S = 1.0f; + randv = 2.0f*(randv - 0.5f); + } + else { + S = -1.0f; + randv = 2.0f*(0.5f - randv); + } + + const float z = (randv*(randv*(randv*0.27385f - 0.73369f) + 0.46341f)) / (randv*(randv*(randv*0.093073f + 0.309420f) - 1.000000f) + 0.597999f); + *slope_y = S * z * safe_sqrtf(1.0f + (*slope_x)*(*slope_x)); +} + +ccl_device_inline float3 microfacet_sample_stretched( + KernelGlobals *kg, const float3 omega_i, + const float alpha_x, const float alpha_y, + const float randu, const float randv, + bool beckmann, float *G1i) +{ + /* 1. 
stretch omega_i */ + float3 omega_i_ = make_float3(alpha_x * omega_i.x, alpha_y * omega_i.y, omega_i.z); + omega_i_ = normalize(omega_i_); + + /* get polar coordinates of omega_i_ */ + float costheta_ = 1.0f; + float sintheta_ = 0.0f; + float cosphi_ = 1.0f; + float sinphi_ = 0.0f; + + if(omega_i_.z < 0.99999f) { + costheta_ = omega_i_.z; + sintheta_ = safe_sqrtf(1.0f - costheta_*costheta_); + + float invlen = 1.0f/sintheta_; + cosphi_ = omega_i_.x * invlen; + sinphi_ = omega_i_.y * invlen; + } + + /* 2. sample P22_{omega_i}(x_slope, y_slope, 1, 1) */ + float slope_x, slope_y; + + if(beckmann) { + microfacet_beckmann_sample_slopes(kg, costheta_, sintheta_, + randu, randv, &slope_x, &slope_y, G1i); + } + else { + microfacet_ggx_sample_slopes(costheta_, sintheta_, + randu, randv, &slope_x, &slope_y, G1i); + } + + /* 3. rotate */ + float tmp = cosphi_*slope_x - sinphi_*slope_y; + slope_y = sinphi_*slope_x + cosphi_*slope_y; + slope_x = tmp; + + /* 4. unstretch */ + slope_x = alpha_x * slope_x; + slope_y = alpha_y * slope_y; + + /* 5. compute normal */ + return normalize(make_float3(-slope_x, -slope_y, 1.0f)); +} + +/* GGX microfacet with Smith shadow-masking from: + * + * Microfacet Models for Refraction through Rough Surfaces + * B. Walter, S. R. Marschner, H. Li, K. E. Torrance, EGSR 2007 + * + * Anisotropic from: + * + * Understanding the Masking-Shadowing Function in Microfacet-Based BRDFs + * E. Heitz, Research Report 2014 + * + * Anisotropy is only supported for reflection currently, but adding it for + * transmission is just a matter of copying code from reflection if needed. 
*/ ccl_device int bsdf_microfacet_ggx_setup(ShaderClosure *sc) { - sc->data0 = clamp(sc->data0, 0.0f, 1.0f); /* m_ag */ + sc->data0 = clamp(sc->data0, 0.0f, 1.0f); /* alpha_x */ + sc->data1 = sc->data0; /* alpha_y */ sc->type = CLOSURE_BSDF_MICROFACET_GGX_ID; return SD_BSDF|SD_BSDF_HAS_EVAL|SD_BSDF_GLOSSY; } +ccl_device int bsdf_microfacet_ggx_aniso_setup(ShaderClosure *sc) +{ + sc->data0 = clamp(sc->data0, 0.0f, 1.0f); /* alpha_x */ + sc->data1 = clamp(sc->data1, 0.0f, 1.0f); /* alpha_y */ + + sc->type = CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID; + + return SD_BSDF|SD_BSDF_HAS_EVAL|SD_BSDF_GLOSSY; +} + ccl_device int bsdf_microfacet_ggx_refraction_setup(ShaderClosure *sc) { - sc->data0 = clamp(sc->data0, 0.0f, 1.0f); /* m_ag */ + sc->data0 = clamp(sc->data0, 0.0f, 1.0f); /* alpha_x */ + sc->data1 = sc->data0; /* alpha_y */ sc->type = CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID; @@ -57,136 +330,250 @@ ccl_device int bsdf_microfacet_ggx_refraction_setup(ShaderClosure *sc) ccl_device void bsdf_microfacet_ggx_blur(ShaderClosure *sc, float roughness) { - sc->data0 = fmaxf(roughness, sc->data0); /* m_ag */ + sc->data0 = fmaxf(roughness, sc->data0); /* alpha_x */ + sc->data1 = fmaxf(roughness, sc->data1); /* alpha_y */ } ccl_device float3 bsdf_microfacet_ggx_eval_reflect(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf) { - float m_ag = max(sc->data0, 1e-4f); + float alpha_x = sc->data0; + float alpha_y = sc->data1; int m_refractive = sc->type == CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID; float3 N = sc->N; - if(m_refractive || m_ag <= 1e-4f) - return make_float3 (0, 0, 0); + if(m_refractive || fmaxf(alpha_x, alpha_y) <= 1e-4f) + return make_float3(0, 0, 0); + float cosNO = dot(N, I); float cosNI = dot(N, omega_in); + if(cosNI > 0 && cosNO > 0) { - // get half vector - float3 Hr = normalize(omega_in + I); - // eq. 20: (F*G*D)/(4*in*on) - // eq. 
33: first we calculate D(m) with m=Hr: - float alpha2 = m_ag * m_ag; - float cosThetaM = dot(N, Hr); - float cosThetaM2 = cosThetaM * cosThetaM; - float tanThetaM2 = (1 - cosThetaM2) / cosThetaM2; - float cosThetaM4 = cosThetaM2 * cosThetaM2; - float D = alpha2 / (M_PI_F * cosThetaM4 * (alpha2 + tanThetaM2) * (alpha2 + tanThetaM2)); - // eq. 34: now calculate G1(i,m) and G1(o,m) - float G1o = 2 / (1 + safe_sqrtf(1 + alpha2 * (1 - cosNO * cosNO) / (cosNO * cosNO))); - float G1i = 2 / (1 + safe_sqrtf(1 + alpha2 * (1 - cosNI * cosNI) / (cosNI * cosNI))); + /* get half vector */ + float3 m = normalize(omega_in + I); + float alpha2 = alpha_x * alpha_y; + float D, G1o, G1i; + + if(alpha_x == alpha_y) { + /* isotropic + * eq. 20: (F*G*D)/(4*in*on) + * eq. 33: first we calculate D(m) */ + float cosThetaM = dot(N, m); + float cosThetaM2 = cosThetaM * cosThetaM; + float cosThetaM4 = cosThetaM2 * cosThetaM2; + float tanThetaM2 = (1 - cosThetaM2) / cosThetaM2; + D = alpha2 / (M_PI_F * cosThetaM4 * (alpha2 + tanThetaM2) * (alpha2 + tanThetaM2)); + + /* eq. 
34: now calculate G1(i,m) and G1(o,m) */ + G1o = 2 / (1 + safe_sqrtf(1 + alpha2 * (1 - cosNO * cosNO) / (cosNO * cosNO))); + G1i = 2 / (1 + safe_sqrtf(1 + alpha2 * (1 - cosNI * cosNI) / (cosNI * cosNI))); + } + else { + /* anisotropic */ + float3 X, Y, Z = N; + make_orthonormals_tangent(Z, sc->T, &X, &Y); + + /* distribution */ + float3 local_m = make_float3(dot(X, m), dot(Y, m), dot(Z, m)); + float slope_x = -local_m.x/(local_m.z*alpha_x); + float slope_y = -local_m.y/(local_m.z*alpha_y); + float slope_len = 1 + slope_x*slope_x + slope_y*slope_y; + + float cosThetaM = local_m.z; + float cosThetaM2 = cosThetaM * cosThetaM; + float cosThetaM4 = cosThetaM2 * cosThetaM2; + + D = 1 / ((slope_len * slope_len) * M_PI_F * alpha2 * cosThetaM4); + + /* G1(i,m) and G1(o,m) */ + float tanThetaO2 = (1 - cosNO * cosNO) / (cosNO * cosNO); + float cosPhiO = dot(I, X); + float sinPhiO = dot(I, Y); + + float alphaO2 = (cosPhiO*cosPhiO)*(alpha_x*alpha_x) + (sinPhiO*sinPhiO)*(alpha_y*alpha_y); + alphaO2 /= cosPhiO*cosPhiO + sinPhiO*sinPhiO; + + G1o = 2 / (1 + safe_sqrtf(1 + alphaO2 * tanThetaO2)); + + float tanThetaI2 = (1 - cosNI * cosNI) / (cosNI * cosNI); + float cosPhiI = dot(omega_in, X); + float sinPhiI = dot(omega_in, Y); + + float alphaI2 = (cosPhiI*cosPhiI)*(alpha_x*alpha_x) + (sinPhiI*sinPhiI)*(alpha_y*alpha_y); + alphaI2 /= cosPhiI*cosPhiI + sinPhiI*sinPhiI; + + G1i = 2 / (1 + safe_sqrtf(1 + alphaI2 * tanThetaI2)); + } + float G = G1o * G1i; - float out = (G * D) * 0.25f / cosNO; - // eq. 24 - float pm = D * cosThetaM; - // convert into pdf of the sampled direction - // eq. 38 - but see also: - // eq. 17 in http://www.graphics.cornell.edu/~bjw/wardnotes.pdf - *pdf = pm * 0.25f / dot(Hr, I); - return make_float3 (out, out, out); + + /* eq. 20 */ + float common = D * 0.25f / cosNO; + float out = G * common; + + /* eq. 2 in distribution of visible normals sampling + * pm = Dw = G1o * dot(m, I) * D / dot(N, I); */ + + /* eq. 38 - but see also: + * eq. 
17 in http://www.graphics.cornell.edu/~bjw/wardnotes.pdf + * pdf = pm * 0.25 / dot(m, I); */ + *pdf = G1o * common; + + return make_float3(out, out, out); } - return make_float3 (0, 0, 0); + + return make_float3(0, 0, 0); } ccl_device float3 bsdf_microfacet_ggx_eval_transmit(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf) { - float m_ag = max(sc->data0, 1e-4f); - float m_eta = sc->data1; + float alpha_x = sc->data0; + float alpha_y = sc->data1; + float m_eta = sc->data2; int m_refractive = sc->type == CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID; float3 N = sc->N; - if(!m_refractive || m_ag <= 1e-4f) - return make_float3 (0, 0, 0); + if(!m_refractive || fmaxf(alpha_x, alpha_y) <= 1e-4f) + return make_float3(0, 0, 0); + float cosNO = dot(N, I); float cosNI = dot(N, omega_in); + if(cosNO <= 0 || cosNI >= 0) - return make_float3 (0, 0, 0); // vectors on same side -- not possible - // compute half-vector of the refraction (eq. 16) + return make_float3(0, 0, 0); /* vectors on same side -- not possible */ + + /* compute half-vector of the refraction (eq. 16) */ float3 ht = -(m_eta * omega_in + I); float3 Ht = normalize(ht); float cosHO = dot(Ht, I); - float cosHI = dot(Ht, omega_in); - // eq. 33: first we calculate D(m) with m=Ht: - float alpha2 = m_ag * m_ag; + + /* those situations makes chi+ terms in eq. 33, 34 be zero */ + if(dot(Ht, N) <= 0.0f || cosHO * cosNO <= 0.0f || cosHI * cosNI <= 0.0f) + return make_float3(0.0f, 0.0f, 0.0f); + + float D, G1o, G1i; + + /* eq. 33: first we calculate D(m) with m=Ht: */ + float alpha2 = alpha_x * alpha_y; float cosThetaM = dot(N, Ht); float cosThetaM2 = cosThetaM * cosThetaM; float tanThetaM2 = (1 - cosThetaM2) / cosThetaM2; float cosThetaM4 = cosThetaM2 * cosThetaM2; - float D = alpha2 / (M_PI_F * cosThetaM4 * (alpha2 + tanThetaM2) * (alpha2 + tanThetaM2)); - // eq. 
34: now calculate G1(i,m) and G1(o,m) - float G1o = 2 / (1 + safe_sqrtf(1 + alpha2 * (1 - cosNO * cosNO) / (cosNO * cosNO))); - float G1i = 2 / (1 + safe_sqrtf(1 + alpha2 * (1 - cosNI * cosNI) / (cosNI * cosNI))); + D = alpha2 / (M_PI_F * cosThetaM4 * (alpha2 + tanThetaM2) * (alpha2 + tanThetaM2)); + + /* eq. 34: now calculate G1(i,m) and G1(o,m) */ + G1o = 2 / (1 + safe_sqrtf(1 + alpha2 * (1 - cosNO * cosNO) / (cosNO * cosNO))); + G1i = 2 / (1 + safe_sqrtf(1 + alpha2 * (1 - cosNI * cosNI) / (cosNI * cosNI))); + float G = G1o * G1i; - // probability - float invHt2 = 1 / dot(ht, ht); - *pdf = D * fabsf(cosThetaM) * (fabsf(cosHI) * (m_eta * m_eta)) * invHt2; - float out = (fabsf(cosHI * cosHO) * (m_eta * m_eta) * (G * D) * invHt2) / cosNO; - return make_float3 (out, out, out); + + /* probability */ + float Ht2 = dot(ht, ht); + + /* eq. 2 in distribution of visible normals sampling + * pm = Dw = G1o * dot(m, I) * D / dot(N, I); */ + + /* out = fabsf(cosHI * cosHO) * (m_eta * m_eta) * G * D / (cosNO * Ht2) + * pdf = pm * (m_eta * m_eta) * fabsf(cosHI) / Ht2 */ + float common = D * (m_eta * m_eta) / (cosNO * Ht2); + float out = G * fabsf(cosHI * cosHO) * common; + *pdf = G1o * cosHO * fabsf(cosHI) * common; + + return make_float3(out, out, out); } -ccl_device int bsdf_microfacet_ggx_sample(const ShaderClosure *sc, float3 Ng, float3 I, float3 dIdx, float3 dIdy, float randu, float randv, float3 *eval, float3 *omega_in, float3 *domega_in_dx, float3 *domega_in_dy, float *pdf) +ccl_device int bsdf_microfacet_ggx_sample(KernelGlobals *kg, const ShaderClosure *sc, float3 Ng, float3 I, float3 dIdx, float3 dIdy, float randu, float randv, float3 *eval, float3 *omega_in, float3 *domega_in_dx, float3 *domega_in_dy, float *pdf) { - float m_ag = sc->data0; + float alpha_x = sc->data0; + float alpha_y = sc->data1; int m_refractive = sc->type == CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID; float3 N = sc->N; float cosNO = dot(N, I); if(cosNO > 0) { float3 X, Y, Z = N; - 
make_orthonormals(Z, &X, &Y); - // generate a random microfacet normal m - // eq. 35,36: - // we take advantage of cos(atan(x)) == 1/sqrt(1+x^2) - //tttt and sin(atan(x)) == x/sqrt(1+x^2) - float alpha2 = m_ag * m_ag; - float tanThetaM2 = alpha2 * randu / (1 - randu); - float cosThetaM = 1 / safe_sqrtf(1 + tanThetaM2); - float sinThetaM = cosThetaM * safe_sqrtf(tanThetaM2); - float phiM = M_2PI_F * randv; - float3 m = (cosf(phiM) * sinThetaM) * X + - (sinf(phiM) * sinThetaM) * Y + - ( cosThetaM) * Z; + + if(alpha_x == alpha_y) + make_orthonormals(Z, &X, &Y); + else + make_orthonormals_tangent(Z, sc->T, &X, &Y); + + /* importance sampling with distribution of visible normals. vectors are + * transformed to local space before and after */ + float3 local_I = make_float3(dot(X, I), dot(Y, I), cosNO); + float3 local_m; + float G1o; + + local_m = microfacet_sample_stretched(kg, local_I, alpha_x, alpha_y, + randu, randv, false, &G1o); + + float3 m = X*local_m.x + Y*local_m.y + Z*local_m.z; + float cosThetaM = local_m.z; + + /* reflection or refraction? */ if(!m_refractive) { float cosMO = dot(m, I); + if(cosMO > 0) { - // eq. 39 - compute actual reflected direction + /* eq. 39 - compute actual reflected direction */ *omega_in = 2 * cosMO * m - I; + if(dot(Ng, *omega_in) > 0) { - if (m_ag <= 1e-4f) { - // some high number for MIS + if(fmaxf(alpha_x, alpha_y) <= 1e-4f) { + /* some high number for MIS */ *pdf = 1e6f; *eval = make_float3(1e6f, 1e6f, 1e6f); } else { - // microfacet normal is visible to this ray - // eq. 33 - float cosThetaM2 = cosThetaM * cosThetaM; - float cosThetaM4 = cosThetaM2 * cosThetaM2; - float D = alpha2 / (M_PI_F * cosThetaM4 * (alpha2 + tanThetaM2) * (alpha2 + tanThetaM2)); - // eq. 24 - float pm = D * cosThetaM; - // convert into pdf of the sampled direction - // eq. 38 - but see also: - // eq. 17 in http://www.graphics.cornell.edu/~bjw/wardnotes.pdf - *pdf = pm * 0.25f / cosMO; - // eval BRDF*cosNI - float cosNI = dot(N, *omega_in); - // eq. 
34: now calculate G1(i,m) and G1(o,m) - float G1o = 2 / (1 + safe_sqrtf(1 + alpha2 * (1 - cosNO * cosNO) / (cosNO * cosNO))); - float G1i = 2 / (1 + safe_sqrtf(1 + alpha2 * (1 - cosNI * cosNI) / (cosNI * cosNI))); - float G = G1o * G1i; - // eq. 20: (F*G*D)/(4*in*on) - float out = (G * D) * 0.25f / cosNO; + /* microfacet normal is visible to this ray */ + /* eq. 33 */ + float alpha2 = alpha_x * alpha_y; + float D, G1i; + + if(alpha_x == alpha_y) { + /* isotropic */ + float cosThetaM2 = cosThetaM * cosThetaM; + float cosThetaM4 = cosThetaM2 * cosThetaM2; + float tanThetaM2 = 1/(cosThetaM2) - 1; + D = alpha2 / (M_PI_F * cosThetaM4 * (alpha2 + tanThetaM2) * (alpha2 + tanThetaM2)); + + /* eval BRDF*cosNI */ + float cosNI = dot(N, *omega_in); + + /* eq. 34: now calculate G1(i,m) */ + G1i = 2 / (1 + safe_sqrtf(1 + alpha2 * (1 - cosNI * cosNI) / (cosNI * cosNI))); + } + else { + /* anisotropic distribution */ + float3 local_m = make_float3(dot(X, m), dot(Y, m), dot(Z, m)); + float slope_x = -local_m.x/(local_m.z*alpha_x); + float slope_y = -local_m.y/(local_m.z*alpha_y); + float slope_len = 1 + slope_x*slope_x + slope_y*slope_y; + + float cosThetaM = local_m.z; + float cosThetaM2 = cosThetaM * cosThetaM; + float cosThetaM4 = cosThetaM2 * cosThetaM2; + + D = 1 / ((slope_len * slope_len) * M_PI_F * alpha2 * cosThetaM4); + + /* calculate G1(i,m) */ + float cosNI = dot(N, *omega_in); + + float tanThetaI2 = (1 - cosNI * cosNI) / (cosNI * cosNI); + float cosPhiI = dot(*omega_in, X); + float sinPhiI = dot(*omega_in, Y); + + float alphaI2 = (cosPhiI*cosPhiI)*(alpha_x*alpha_x) + (sinPhiI*sinPhiI)*(alpha_y*alpha_y); + alphaI2 /= cosPhiI*cosPhiI + sinPhiI*sinPhiI; + + G1i = 2 / (1 + safe_sqrtf(1 + alphaI2 * tanThetaI2)); + } + + /* see eval function for derivation */ + float common = (G1o * D) * 0.25f / cosNO; + float out = G1i * common; + *pdf = common; + *eval = make_float3(out, out, out); } @@ -198,14 +585,15 @@ ccl_device int bsdf_microfacet_ggx_sample(const ShaderClosure *sc, 
float3 Ng, fl } } else { - // CAUTION: the i and o variables are inverted relative to the paper - // eq. 39 - compute actual refractive direction + /* CAUTION: the i and o variables are inverted relative to the paper + * eq. 39 - compute actual refractive direction */ float3 R, T; #ifdef __RAY_DIFFERENTIALS__ float3 dRdx, dRdy, dTdx, dTdy; #endif - float m_eta = sc->data1; + float m_eta = sc->data2; bool inside; + fresnel_dielectric(m_eta, m, I, &R, &T, #ifdef __RAY_DIFFERENTIALS__ dIdx, dIdy, &dRdx, &dRdy, &dTdx, &dTdy, @@ -213,38 +601,43 @@ ccl_device int bsdf_microfacet_ggx_sample(const ShaderClosure *sc, float3 Ng, fl &inside); if(!inside) { + *omega_in = T; #ifdef __RAY_DIFFERENTIALS__ *domega_in_dx = dTdx; *domega_in_dy = dTdy; #endif - if (m_ag <= 1e-4f || fabsf(m_eta - 1.0f) < 1e-4f) { - // some high number for MIS + if(fmaxf(alpha_x, alpha_y) <= 1e-4f || fabsf(m_eta - 1.0f) < 1e-4f) { + /* some high number for MIS */ *pdf = 1e6f; *eval = make_float3(1e6f, 1e6f, 1e6f); } else { - // eq. 33 + /* eq. 33 */ + float alpha2 = alpha_x * alpha_y; float cosThetaM2 = cosThetaM * cosThetaM; float cosThetaM4 = cosThetaM2 * cosThetaM2; + float tanThetaM2 = 1/(cosThetaM2) - 1; float D = alpha2 / (M_PI_F * cosThetaM4 * (alpha2 + tanThetaM2) * (alpha2 + tanThetaM2)); - // eq. 24 - float pm = D * cosThetaM; - // eval BRDF*cosNI + + /* eval BRDF*cosNI */ float cosNI = dot(N, *omega_in); - // eq. 34: now calculate G1(i,m) and G1(o,m) - float G1o = 2 / (1 + safe_sqrtf(1 + alpha2 * (1 - cosNO * cosNO) / (cosNO * cosNO))); + + /* eq. 34: now calculate G1(i,m) */ float G1i = 2 / (1 + safe_sqrtf(1 + alpha2 * (1 - cosNI * cosNI) / (cosNI * cosNI))); - float G = G1o * G1i; - // eq. 21 + + /* eq. 21 */ float cosHI = dot(m, *omega_in); float cosHO = dot(m, I); float Ht2 = m_eta * cosHI + cosHO; Ht2 *= Ht2; - float out = (fabsf(cosHI * cosHO) * (m_eta * m_eta) * (G * D)) / (cosNO * Ht2); - // eq. 38 and eq. 
17 - *pdf = pm * (m_eta * m_eta) * fabsf(cosHI) / Ht2; + + /* see eval function for derivation */ + float common = (G1o * D) * (m_eta * m_eta) / (cosNO * Ht2); + float out = G1i * fabsf(cosHI * cosHO) * common; + *pdf = cosHO * fabsf(cosHI) * common; + *eval = make_float3(out, out, out); } } @@ -253,19 +646,33 @@ ccl_device int bsdf_microfacet_ggx_sample(const ShaderClosure *sc, float3 Ng, fl return (m_refractive) ? LABEL_TRANSMIT|LABEL_GLOSSY : LABEL_REFLECT|LABEL_GLOSSY; } -/* BECKMANN */ +/* Beckmann microfacet with Smith shadow-masking from: + * + * Microfacet Models for Refraction through Rough Surfaces + * B. Walter, S. R. Marschner, H. Li, K. E. Torrance, EGSR 2007 */ ccl_device int bsdf_microfacet_beckmann_setup(ShaderClosure *sc) { - sc->data0 = clamp(sc->data0, 0.0f, 1.0f); /* m_ab */ + sc->data0 = clamp(sc->data0, 0.0f, 1.0f); /* alpha_x */ + sc->data1 = sc->data0; /* alpha_y */ sc->type = CLOSURE_BSDF_MICROFACET_BECKMANN_ID; return SD_BSDF|SD_BSDF_HAS_EVAL|SD_BSDF_GLOSSY; } +ccl_device int bsdf_microfacet_beckmann_aniso_setup(ShaderClosure *sc) +{ + sc->data0 = clamp(sc->data0, 0.0f, 1.0f); /* alpha_x */ + sc->data1 = clamp(sc->data1, 0.0f, 1.0f); /* alpha_y */ + + sc->type = CLOSURE_BSDF_MICROFACET_BECKMANN_ANISO_ID; + return SD_BSDF|SD_BSDF_HAS_EVAL|SD_BSDF_GLOSSY; +} + ccl_device int bsdf_microfacet_beckmann_refraction_setup(ShaderClosure *sc) { - sc->data0 = clamp(sc->data0, 0.0f, 1.0f); /* m_ab */ + sc->data0 = clamp(sc->data0, 0.0f, 1.0f); /* alpha_x */ + sc->data1 = sc->data0; /* alpha_y */ sc->type = CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID; return SD_BSDF|SD_BSDF_HAS_EVAL|SD_BSDF_GLOSSY; @@ -273,155 +680,257 @@ ccl_device int bsdf_microfacet_beckmann_refraction_setup(ShaderClosure *sc) ccl_device void bsdf_microfacet_beckmann_blur(ShaderClosure *sc, float roughness) { - sc->data0 = fmaxf(roughness, sc->data0); /* m_ab */ + sc->data0 = fmaxf(roughness, sc->data0); /* alpha_x */ + sc->data1 = fmaxf(roughness, sc->data1); /* alpha_y */ } 
ccl_device float3 bsdf_microfacet_beckmann_eval_reflect(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf) { - float m_ab = max(sc->data0, 1e-4f); + float alpha_x = sc->data0; + float alpha_y = sc->data1; int m_refractive = sc->type == CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID; float3 N = sc->N; - if(m_refractive || m_ab <= 1e-4f) - return make_float3 (0, 0, 0); + if(m_refractive || fmaxf(alpha_x, alpha_y) <= 1e-4f) + return make_float3(0, 0, 0); + float cosNO = dot(N, I); float cosNI = dot(N, omega_in); + if(cosNO > 0 && cosNI > 0) { - // get half vector - float3 Hr = normalize(omega_in + I); - // eq. 20: (F*G*D)/(4*in*on) - // eq. 25: first we calculate D(m) with m=Hr: - float alpha2 = m_ab * m_ab; - float cosThetaM = dot(N, Hr); - float cosThetaM2 = cosThetaM * cosThetaM; - float tanThetaM2 = (1 - cosThetaM2) / cosThetaM2; - float cosThetaM4 = cosThetaM2 * cosThetaM2; - float D = expf(-tanThetaM2 / alpha2) / (M_PI_F * alpha2 * cosThetaM4); - // eq. 26, 27: now calculate G1(i,m) and G1(o,m) - float ao = 1 / (m_ab * safe_sqrtf((1 - cosNO * cosNO) / (cosNO * cosNO))); - float ai = 1 / (m_ab * safe_sqrtf((1 - cosNI * cosNI) / (cosNI * cosNI))); - float G1o = ao < 1.6f ? (3.535f * ao + 2.181f * ao * ao) / (1 + 2.276f * ao + 2.577f * ao * ao) : 1.0f; - float G1i = ai < 1.6f ? (3.535f * ai + 2.181f * ai * ai) / (1 + 2.276f * ai + 2.577f * ai * ai) : 1.0f; - float G = G1o * G1i; - float out = (G * D) * 0.25f / cosNO; - // eq. 24 - float pm = D * cosThetaM; - // convert into pdf of the sampled direction - // eq. 38 - but see also: - // eq. 17 in http://www.graphics.cornell.edu/~bjw/wardnotes.pdf - *pdf = pm * 0.25f / dot(Hr, I); - return make_float3 (out, out, out); + /* get half vector */ + float3 m = normalize(omega_in + I); + + float alpha2 = alpha_x * alpha_y; + float D, G1o, G1i; + + if(alpha_x == alpha_y) { + /* isotropic + * eq. 20: (F*G*D)/(4*in*on) + * eq. 
25: first we calculate D(m) */ + float cosThetaM = dot(N, m); + float cosThetaM2 = cosThetaM * cosThetaM; + float tanThetaM2 = (1 - cosThetaM2) / cosThetaM2; + float cosThetaM4 = cosThetaM2 * cosThetaM2; + D = expf(-tanThetaM2 / alpha2) / (M_PI_F * alpha2 * cosThetaM4); + + /* eq. 26, 27: now calculate G1(i,m) and G1(o,m) */ + float ao = 1 / (alpha_x * safe_sqrtf((1 - cosNO * cosNO) / (cosNO * cosNO))); + float ai = 1 / (alpha_x * safe_sqrtf((1 - cosNI * cosNI) / (cosNI * cosNI))); + G1o = ao < 1.6f ? (3.535f * ao + 2.181f * ao * ao) / (1 + 2.276f * ao + 2.577f * ao * ao) : 1.0f; + G1i = ai < 1.6f ? (3.535f * ai + 2.181f * ai * ai) / (1 + 2.276f * ai + 2.577f * ai * ai) : 1.0f; + } + else { + /* anisotropic */ + float3 X, Y, Z = N; + make_orthonormals_tangent(Z, sc->T, &X, &Y); + + /* distribution */ + float3 local_m = make_float3(dot(X, m), dot(Y, m), dot(Z, m)); + float slope_x = -local_m.x/(local_m.z*alpha_x); + float slope_y = -local_m.y/(local_m.z*alpha_y); + + float cosThetaM = local_m.z; + float cosThetaM2 = cosThetaM * cosThetaM; + float cosThetaM4 = cosThetaM2 * cosThetaM2; + + D = expf(-slope_x*slope_x - slope_y*slope_y) / (M_PI_F * alpha2 * cosThetaM4); + + /* G1(i,m) and G1(o,m) */ + float tanThetaO2 = (1 - cosNO * cosNO) / (cosNO * cosNO); + float cosPhiO = dot(I, X); + float sinPhiO = dot(I, Y); + + float alphaO2 = (cosPhiO*cosPhiO)*(alpha_x*alpha_x) + (sinPhiO*sinPhiO)*(alpha_y*alpha_y); + alphaO2 /= cosPhiO*cosPhiO + sinPhiO*sinPhiO; + + float tanThetaI2 = (1 - cosNI * cosNI) / (cosNI * cosNI); + float cosPhiI = dot(omega_in, X); + float sinPhiI = dot(omega_in, Y); + + float alphaI2 = (cosPhiI*cosPhiI)*(alpha_x*alpha_x) + (sinPhiI*sinPhiI)*(alpha_y*alpha_y); + alphaI2 /= cosPhiI*cosPhiI + sinPhiI*sinPhiI; + + float ao = 1 / (safe_sqrtf(alphaO2 * tanThetaO2)); + float ai = 1 / (safe_sqrtf(alphaI2 * tanThetaI2)); + G1o = ao < 1.6f ? (3.535f * ao + 2.181f * ao * ao) / (1 + 2.276f * ao + 2.577f * ao * ao) : 1.0f; + G1i = ai < 1.6f ? 
(3.535f * ai + 2.181f * ai * ai) / (1 + 2.276f * ai + 2.577f * ai * ai) : 1.0f; + } + + float G = G1o * G1i; + + /* eq. 20 */ + float common = D * 0.25f / cosNO; + float out = G * common; + + /* eq. 2 in distribution of visible normals sampling + * pm = Dw = G1o * dot(m, I) * D / dot(N, I); */ + + /* eq. 38 - but see also: + * eq. 17 in http://www.graphics.cornell.edu/~bjw/wardnotes.pdf + * pdf = pm * 0.25 / dot(m, I); */ + *pdf = G1o * common; + + return make_float3(out, out, out); } - return make_float3 (0, 0, 0); + + return make_float3(0, 0, 0); } ccl_device float3 bsdf_microfacet_beckmann_eval_transmit(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf) { - float m_ab = max(sc->data0, 1e-4f); - float m_eta = sc->data1; + float alpha_x = sc->data0; + float alpha_y = sc->data1; + float m_eta = sc->data2; int m_refractive = sc->type == CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID; float3 N = sc->N; - if(!m_refractive || m_ab <= 1e-4f) - return make_float3 (0, 0, 0); + if(!m_refractive || fmaxf(alpha_x, alpha_y) <= 1e-4f) + return make_float3(0, 0, 0); + float cosNO = dot(N, I); float cosNI = dot(N, omega_in); + if(cosNO <= 0 || cosNI >= 0) - return make_float3 (0, 0, 0); - // compute half-vector of the refraction (eq. 16) + return make_float3(0, 0, 0); + + /* compute half-vector of the refraction (eq. 16) */ float3 ht = -(m_eta * omega_in + I); float3 Ht = normalize(ht); float cosHO = dot(Ht, I); - float cosHI = dot(Ht, omega_in); - // eq. 33: first we calculate D(m) with m=Ht: - float alpha2 = m_ab * m_ab; + + /* those situations makes chi+ terms in eq. 25, 27 be zero */ + if(dot(Ht, N) <= 0.0f || cosHO * cosNO <= 0.0f || cosHI * cosNI <= 0.0f) + return make_float3(0.0f, 0.0f, 0.0f); + + /* eq. 
25: first we calculate D(m) with m=Ht: */ + float alpha2 = alpha_x * alpha_y; float cosThetaM = min(dot(N, Ht), 1.0f); float cosThetaM2 = cosThetaM * cosThetaM; float tanThetaM2 = (1 - cosThetaM2) / cosThetaM2; float cosThetaM4 = cosThetaM2 * cosThetaM2; float D = expf(-tanThetaM2 / alpha2) / (M_PI_F * alpha2 * cosThetaM4); - // eq. 26, 27: now calculate G1(i,m) and G1(o,m) - float ao = 1 / (m_ab * safe_sqrtf((1 - cosNO * cosNO) / (cosNO * cosNO))); - float ai = 1 / (m_ab * safe_sqrtf((1 - cosNI * cosNI) / (cosNI * cosNI))); + + /* eq. 26, 27: now calculate G1(i,m) and G1(o,m) */ + float ao = 1 / (alpha_x * safe_sqrtf((1 - cosNO * cosNO) / (cosNO * cosNO))); + float ai = 1 / (alpha_x * safe_sqrtf((1 - cosNI * cosNI) / (cosNI * cosNI))); float G1o = ao < 1.6f ? (3.535f * ao + 2.181f * ao * ao) / (1 + 2.276f * ao + 2.577f * ao * ao) : 1.0f; float G1i = ai < 1.6f ? (3.535f * ai + 2.181f * ai * ai) / (1 + 2.276f * ai + 2.577f * ai * ai) : 1.0f; float G = G1o * G1i; - // probability - float invHt2 = 1 / dot(ht, ht); - *pdf = D * fabsf(cosThetaM) * (fabsf(cosHI) * (m_eta * m_eta)) * invHt2; - float out = (fabsf(cosHI * cosHO) * (m_eta * m_eta) * (G * D) * invHt2) / cosNO; - return make_float3 (out, out, out); + + /* probability */ + float Ht2 = dot(ht, ht); + + /* eq. 
2 in distribution of visible normals sampling + * pm = Dw = G1o * dot(m, I) * D / dot(N, I); */ + + /* out = fabsf(cosHI * cosHO) * (m_eta * m_eta) * G * D / (cosNO * Ht2) + * pdf = pm * (m_eta * m_eta) * fabsf(cosHI) / Ht2 */ + float common = D * (m_eta * m_eta) / (cosNO * Ht2); + float out = G * fabsf(cosHI * cosHO) * common; + *pdf = G1o * cosHO * fabsf(cosHI) * common; + + return make_float3(out, out, out); } -ccl_device int bsdf_microfacet_beckmann_sample(const ShaderClosure *sc, float3 Ng, float3 I, float3 dIdx, float3 dIdy, float randu, float randv, float3 *eval, float3 *omega_in, float3 *domega_in_dx, float3 *domega_in_dy, float *pdf) +ccl_device int bsdf_microfacet_beckmann_sample(KernelGlobals *kg, const ShaderClosure *sc, float3 Ng, float3 I, float3 dIdx, float3 dIdy, float randu, float randv, float3 *eval, float3 *omega_in, float3 *domega_in_dx, float3 *domega_in_dy, float *pdf) { - float m_ab = sc->data0; + float alpha_x = sc->data0; + float alpha_y = sc->data1; int m_refractive = sc->type == CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID; float3 N = sc->N; float cosNO = dot(N, I); if(cosNO > 0) { float3 X, Y, Z = N; - make_orthonormals(Z, &X, &Y); - // generate a random microfacet normal m - // eq. 35,36: - // we take advantage of cos(atan(x)) == 1/sqrt(1+x^2) - //tttt and sin(atan(x)) == x/sqrt(1+x^2) - float alpha2 = m_ab * m_ab; - float tanThetaM, cosThetaM; - - if(alpha2 == 0.0f) { - tanThetaM = 0.0f; - cosThetaM = 1.0f; - } - else { - tanThetaM = safe_sqrtf(-alpha2 * logf(1 - randu)); - cosThetaM = 1 / safe_sqrtf(1 + tanThetaM * tanThetaM); - } - float sinThetaM = cosThetaM * tanThetaM; - float phiM = M_2PI_F * randv; - float3 m = (cosf(phiM) * sinThetaM) * X + - (sinf(phiM) * sinThetaM) * Y + - ( cosThetaM) * Z; + if(alpha_x == alpha_y) + make_orthonormals(Z, &X, &Y); + else + make_orthonormals_tangent(Z, sc->T, &X, &Y); + + /* importance sampling with distribution of visible normals. 
vectors are + * transformed to local space before and after */ + float3 local_I = make_float3(dot(X, I), dot(Y, I), cosNO); + float3 local_m; + float G1o; + + /* stretch by both roughness axes so the sampled normal matches the anisotropic D term evaluated below */ + local_m = microfacet_sample_stretched(kg, local_I, alpha_x, alpha_y, + randu, randv, true, &G1o); + + float3 m = X*local_m.x + Y*local_m.y + Z*local_m.z; + float cosThetaM = local_m.z; + + /* reflection or refraction? */ if(!m_refractive) { float cosMO = dot(m, I); + if(cosMO > 0) { - // eq. 39 - compute actual reflected direction + /* eq. 39 - compute actual reflected direction */ *omega_in = 2 * cosMO * m - I; + if(dot(Ng, *omega_in) > 0) { - if (m_ab <= 1e-4f) { - // some high number for MIS + if(fmaxf(alpha_x, alpha_y) <= 1e-4f) { + /* some high number for MIS */ *pdf = 1e6f; *eval = make_float3(1e6f, 1e6f, 1e6f); } else { - // microfacet normal is visible to this ray - // eq. 25 - float cosThetaM2 = cosThetaM * cosThetaM; - float tanThetaM2 = tanThetaM * tanThetaM; - float cosThetaM4 = cosThetaM2 * cosThetaM2; - float D = expf(-tanThetaM2 / alpha2) / (M_PI_F * alpha2 * cosThetaM4); - // eq. 24 - float pm = D * cosThetaM; - // convert into pdf of the sampled direction - // eq. 38 - but see also: - // eq. 17 in http://www.graphics.cornell.edu/~bjw/wardnotes.pdf - *pdf = pm * 0.25f / cosMO; - // Eval BRDF*cosNI - float cosNI = dot(N, *omega_in); - // eq. 26, 27: now calculate G1(i,m) and G1(o,m) - float ao = 1 / (m_ab * safe_sqrtf((1 - cosNO * cosNO) / (cosNO * cosNO))); - float ai = 1 / (m_ab * safe_sqrtf((1 - cosNI * cosNI) / (cosNI * cosNI))); - float G1o = ao < 1.6f ? (3.535f * ao + 2.181f * ao * ao) / (1 + 2.276f * ao + 2.577f * ao * ao) : 1.0f; - float G1i = ai < 1.6f ? (3.535f * ai + 2.181f * ai * ai) / (1 + 2.276f * ai + 2.577f * ai * ai) : 1.0f; + /* microfacet normal is visible to this ray + * eq. 
25 */ + float alpha2 = alpha_x * alpha_y; + float D, G1i; + + if(alpha_x == alpha_y) { + /* isotropic distribution */ + float cosThetaM2 = cosThetaM * cosThetaM; + float cosThetaM4 = cosThetaM2 * cosThetaM2; + float tanThetaM2 = 1/(cosThetaM2) - 1; + D = expf(-tanThetaM2 / alpha2) / (M_PI_F * alpha2 * cosThetaM4); + + /* eval BRDF*cosNI */ + float cosNI = dot(N, *omega_in); + + /* eq. 26, 27: now calculate G1(i,m) */ + float ai = 1 / (alpha_x * safe_sqrtf((1 - cosNI * cosNI) / (cosNI * cosNI))); + G1i = ai < 1.6f ? (3.535f * ai + 2.181f * ai * ai) / (1 + 2.276f * ai + 2.577f * ai * ai) : 1.0f; + } + else { + /* anisotropic distribution */ + float3 local_m = make_float3(dot(X, m), dot(Y, m), dot(Z, m)); + float slope_x = -local_m.x/(local_m.z*alpha_x); + float slope_y = -local_m.y/(local_m.z*alpha_y); + + float cosThetaM = local_m.z; + float cosThetaM2 = cosThetaM * cosThetaM; + float cosThetaM4 = cosThetaM2 * cosThetaM2; + + D = expf(-slope_x*slope_x - slope_y*slope_y) / (M_PI_F * alpha2 * cosThetaM4); + + /* G1(i,m) */ + float cosNI = dot(N, *omega_in); + float tanThetaI2 = (1 - cosNI * cosNI) / (cosNI * cosNI); + float cosPhiI = dot(*omega_in, X); + float sinPhiI = dot(*omega_in, Y); + + float alphaI2 = (cosPhiI*cosPhiI)*(alpha_x*alpha_x) + (sinPhiI*sinPhiI)*(alpha_y*alpha_y); + alphaI2 /= cosPhiI*cosPhiI + sinPhiI*sinPhiI; + + float ai = 1 / (safe_sqrtf(alphaI2 * tanThetaI2)); + G1i = ai < 1.6f ? (3.535f * ai + 2.181f * ai * ai) / (1 + 2.276f * ai + 2.577f * ai * ai) : 1.0f; + } + float G = G1o * G1i; - // eq. 
20: (F*G*D)/(4*in*on) - float out = (G * D) * 0.25f / cosNO; + + /* see eval function for derivation */ + float common = D * 0.25f / cosNO; + float out = G * common; + *pdf = G1o * common; + *eval = make_float3(out, out, out); } + #ifdef __RAY_DIFFERENTIALS__ *domega_in_dx = (2 * dot(m, dIdx)) * m - dIdx; *domega_in_dy = (2 * dot(m, dIdy)) * m - dIdy; @@ -430,14 +939,15 @@ ccl_device int bsdf_microfacet_beckmann_sample(const ShaderClosure *sc, float3 N } } else { - // CAUTION: the i and o variables are inverted relative to the paper - // eq. 39 - compute actual refractive direction + /* CAUTION: the i and o variables are inverted relative to the paper + * eq. 39 - compute actual refractive direction */ float3 R, T; #ifdef __RAY_DIFFERENTIALS__ float3 dRdx, dRdy, dTdx, dTdy; #endif - float m_eta = sc->data1; + float m_eta = sc->data2; bool inside; + fresnel_dielectric(m_eta, m, I, &R, &T, #ifdef __RAY_DIFFERENTIALS__ dIdx, dIdy, &dRdx, &dRdy, &dTdx, &dTdy, @@ -446,39 +956,44 @@ ccl_device int bsdf_microfacet_beckmann_sample(const ShaderClosure *sc, float3 N if(!inside) { *omega_in = T; + #ifdef __RAY_DIFFERENTIALS__ *domega_in_dx = dTdx; *domega_in_dy = dTdy; #endif - if (m_ab <= 1e-4f || fabsf(m_eta - 1.0f) < 1e-4f) { - // some high number for MIS + + if(fmaxf(alpha_x, alpha_y) <= 1e-4f || fabsf(m_eta - 1.0f) < 1e-4f) { + /* some high number for MIS */ *pdf = 1e6f; *eval = make_float3(1e6f, 1e6f, 1e6f); } else { - // eq. 33 + /* eq. 33 */ + float alpha2 = alpha_x * alpha_y; float cosThetaM2 = cosThetaM * cosThetaM; - float tanThetaM2 = tanThetaM * tanThetaM; float cosThetaM4 = cosThetaM2 * cosThetaM2; + float tanThetaM2 = 1/(cosThetaM2) - 1; float D = expf(-tanThetaM2 / alpha2) / (M_PI_F * alpha2 * cosThetaM4); - // eq. 24 - float pm = D * cosThetaM; - // eval BRDF*cosNI + + /* eval BRDF*cosNI */ float cosNI = dot(N, *omega_in); - // eq. 
26, 27: now calculate G1(i,m) and G1(o,m) - float ao = 1 / (m_ab * safe_sqrtf((1 - cosNO * cosNO) / (cosNO * cosNO))); - float ai = 1 / (m_ab * safe_sqrtf((1 - cosNI * cosNI) / (cosNI * cosNI))); - float G1o = ao < 1.6f ? (3.535f * ao + 2.181f * ao * ao) / (1 + 2.276f * ao + 2.577f * ao * ao) : 1.0f; + + /* eq. 26, 27: now calculate G1(i,m) */ + float ai = 1 / (alpha_x * safe_sqrtf((1 - cosNI * cosNI) / (cosNI * cosNI))); float G1i = ai < 1.6f ? (3.535f * ai + 2.181f * ai * ai) / (1 + 2.276f * ai + 2.577f * ai * ai) : 1.0f; float G = G1o * G1i; - // eq. 21 + + /* eq. 21 */ float cosHI = dot(m, *omega_in); float cosHO = dot(m, I); float Ht2 = m_eta * cosHI + cosHO; Ht2 *= Ht2; - float out = (fabsf(cosHI * cosHO) * (m_eta * m_eta) * (G * D)) / (cosNO * Ht2); - // eq. 38 and eq. 17 - *pdf = pm * (m_eta * m_eta) * fabsf(cosHI) / Ht2; + + /* see eval function for derivation */ + float common = D * (m_eta * m_eta) / (cosNO * Ht2); + float out = G * fabsf(cosHI * cosHO) * common; + *pdf = G1o * cosHO * fabsf(cosHI) * common; + *eval = make_float3(out, out, out); } } diff --git a/intern/cycles/kernel/closure/bsdf_util.h b/intern/cycles/kernel/closure/bsdf_util.h index b3dcb9dcc38..05816bac2c1 100644 --- a/intern/cycles/kernel/closure/bsdf_util.h +++ b/intern/cycles/kernel/closure/bsdf_util.h @@ -111,16 +111,20 @@ ccl_device float fresnel_dielectric_cos(float cosi, float eta) return 1.0f; // TIR(no refracted component) } -ccl_device float fresnel_conductor(float cosi, float eta, float k) +#if 0 +ccl_device float3 fresnel_conductor(float cosi, const float3 eta, const float3 k) { - float tmp_f = eta * eta + k * k; - float tmp = tmp_f * cosi * cosi; - float Rparl2 = (tmp - (2.0f * eta * cosi) + 1)/ - (tmp + (2.0f * eta * cosi) + 1); - float Rperp2 = (tmp_f - (2.0f * eta * cosi) + cosi * cosi)/ - (tmp_f + (2.0f * eta * cosi) + cosi * cosi); + float3 cosi2 = make_float3(cosi*cosi); + float3 one = make_float3(1.0f, 1.0f, 1.0f); + float3 tmp_f = eta * eta + k * k; + float3 tmp = 
tmp_f * cosi2; + float3 Rparl2 = (tmp - (2.0f * eta * cosi) + one) / + (tmp + (2.0f * eta * cosi) + one); + float3 Rperp2 = (tmp_f - (2.0f * eta * cosi) + cosi2) / + (tmp_f + (2.0f * eta * cosi) + cosi2); return(Rparl2 + Rperp2) * 0.5f; } +#endif ccl_device float smooth_step(float edge0, float edge1, float x) { diff --git a/intern/cycles/kernel/closure/bsdf_ward.h b/intern/cycles/kernel/closure/bsdf_ward.h deleted file mode 100644 index c9de615a011..00000000000 --- a/intern/cycles/kernel/closure/bsdf_ward.h +++ /dev/null @@ -1,189 +0,0 @@ -/* - * Adapted from Open Shading Language with this license: - * - * Copyright (c) 2009-2010 Sony Pictures Imageworks Inc., et al. - * All Rights Reserved. - * - * Modifications Copyright 2011, Blender Foundation. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are - * met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Sony Pictures Imageworks nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -#ifndef __BSDF_WARD_H__ -#define __BSDF_WARD_H__ - -CCL_NAMESPACE_BEGIN - -/* WARD */ - -ccl_device int bsdf_ward_setup(ShaderClosure *sc) -{ - sc->data0 = clamp(sc->data0, 1e-4f, 1.0f); /* m_ax */ - sc->data1 = clamp(sc->data1, 1e-4f, 1.0f); /* m_ay */ - - sc->type = CLOSURE_BSDF_WARD_ID; - return SD_BSDF|SD_BSDF_HAS_EVAL|SD_BSDF_GLOSSY; -} - -ccl_device void bsdf_ward_blur(ShaderClosure *sc, float roughness) -{ - sc->data0 = fmaxf(roughness, sc->data0); /* m_ax */ - sc->data1 = fmaxf(roughness, sc->data1); /* m_ay */ -} - -ccl_device float3 bsdf_ward_eval_reflect(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf) -{ - float m_ax = sc->data0; - float m_ay = sc->data1; - float3 N = sc->N; - float3 T = sc->T; - - float cosNO = dot(N, I); - float cosNI = dot(N, omega_in); - - if(cosNI > 0.0f && cosNO > 0.0f) { - cosNO = max(cosNO, 1e-4f); - cosNI = max(cosNI, 1e-4f); - - // get half vector and get x,y basis on the surface for anisotropy - float3 H = normalize(omega_in + I); // normalize needed for pdf - float3 X, Y; - make_orthonormals_tangent(N, T, &X, &Y); - // eq. 
4 - float dotx = dot(H, X) / m_ax; - float doty = dot(H, Y) / m_ay; - float dotn = dot(H, N); - float exp_arg = (dotx * dotx + doty * doty) / (dotn * dotn); - float denom = (M_4PI_F * m_ax * m_ay * sqrtf(cosNO * cosNI)); - float exp_val = expf(-exp_arg); - float out = cosNI * exp_val / denom; - float oh = dot(H, I); - denom = M_4PI_F * m_ax * m_ay * oh * dotn * dotn * dotn; - *pdf = exp_val / denom; - return make_float3 (out, out, out); - } - - return make_float3 (0, 0, 0); -} - -ccl_device float3 bsdf_ward_eval_transmit(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf) -{ - return make_float3(0.0f, 0.0f, 0.0f); -} - -ccl_device int bsdf_ward_sample(const ShaderClosure *sc, float3 Ng, float3 I, float3 dIdx, float3 dIdy, float randu, float randv, float3 *eval, float3 *omega_in, float3 *domega_in_dx, float3 *domega_in_dy, float *pdf) -{ - float m_ax = sc->data0; - float m_ay = sc->data1; - float3 N = sc->N; - float3 T = sc->T; - - float cosNO = dot(N, I); - if(cosNO > 0.0f) { - // get x,y basis on the surface for anisotropy - float3 X, Y; - make_orthonormals_tangent(N, T, &X, &Y); - // generate random angles for the half vector - // eq. 
7 (taking care around discontinuities to keep - //ttoutput angle in the right quadrant) - // we take advantage of cos(atan(x)) == 1/sqrt(1+x^2) - //tttt and sin(atan(x)) == x/sqrt(1+x^2) - float alphaRatio = m_ay / m_ax; - float cosPhi, sinPhi; - if(randu < 0.25f) { - float val = 4 * randu; - float tanPhi = alphaRatio * tanf(M_PI_2_F * val); - cosPhi = 1 / sqrtf(1 + tanPhi * tanPhi); - sinPhi = tanPhi * cosPhi; - } - else if(randu < 0.5f) { - float val = 1 - 4 * (0.5f - randu); - float tanPhi = alphaRatio * tanf(M_PI_2_F * val); - // phi = M_PI_F - phi; - cosPhi = -1 / sqrtf(1 + tanPhi * tanPhi); - sinPhi = -tanPhi * cosPhi; - } - else if(randu < 0.75f) { - float val = 4 * (randu - 0.5f); - float tanPhi = alphaRatio * tanf(M_PI_2_F * val); - //phi = M_PI_F + phi; - cosPhi = -1 / sqrtf(1 + tanPhi * tanPhi); - sinPhi = tanPhi * cosPhi; - } - else { - float val = 1 - 4 * (1 - randu); - float tanPhi = alphaRatio * tanf(M_PI_2_F * val); - // phi = M_2PI_F - phi; - cosPhi = 1 / sqrtf(1 + tanPhi * tanPhi); - sinPhi = -tanPhi * cosPhi; - } - // eq. 6 - // we take advantage of cos(atan(x)) == 1/sqrt(1+x^2) - //tttt and sin(atan(x)) == x/sqrt(1+x^2) - float thetaDenom = (cosPhi * cosPhi) / (m_ax * m_ax) + (sinPhi * sinPhi) / (m_ay * m_ay); - float tanTheta2 = -logf(1 - randv) / thetaDenom; - float cosTheta = 1 / sqrtf(1 + tanTheta2); - float sinTheta = cosTheta * sqrtf(tanTheta2); - - float3 h; // already normalized becaused expressed from spherical coordinates - h.x = sinTheta * cosPhi; - h.y = sinTheta * sinPhi; - h.z = cosTheta; - // compute terms that are easier in local space - float dotx = h.x / m_ax; - float doty = h.y / m_ay; - float dotn = h.z; - // transform to world space - h = h.x * X + h.y * Y + h.z * N; - // generate the final sample - float oh = dot(h, I); - *omega_in = 2.0f * oh * h - I; - if(dot(Ng, *omega_in) > 0) { - float cosNI = dot(N, *omega_in); - if(cosNI > 0) { - cosNO = max(cosNO, 1e-4f); - cosNI = max(cosNI, 1e-4f); - - // eq. 
9 - float exp_arg = (dotx * dotx + doty * doty) / (dotn * dotn); - float denom = M_4PI_F * m_ax * m_ay * oh * dotn * dotn * dotn; - *pdf = expf(-exp_arg) / denom; - // compiler will reuse expressions already computed - denom = (M_4PI_F * m_ax * m_ay * sqrtf(cosNO * cosNI)); - float power = cosNI * expf(-exp_arg) / denom; - *eval = make_float3(power, power, power); -#ifdef __RAY_DIFFERENTIALS__ - *domega_in_dx = (2 * dot(N, dIdx)) * N - dIdx; - *domega_in_dy = (2 * dot(N, dIdy)) * N - dIdy; -#endif - } - } - } - return LABEL_REFLECT|LABEL_GLOSSY; -} - -CCL_NAMESPACE_END - -#endif /* __BSDF_WARD_H__ */ - diff --git a/intern/cycles/kernel/closure/bsdf_westin.h b/intern/cycles/kernel/closure/bsdf_westin.h deleted file mode 100644 index 9dc1c00bb3d..00000000000 --- a/intern/cycles/kernel/closure/bsdf_westin.h +++ /dev/null @@ -1,180 +0,0 @@ -/* - * Adapted from Open Shading Language with this license: - * - * Copyright (c) 2009-2010 Sony Pictures Imageworks Inc., et al. - * All Rights Reserved. - * - * Modifications Copyright 2011, Blender Foundation. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are - * met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Sony Pictures Imageworks nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. 
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -#ifndef __BSDF_WESTIN_H__ -#define __BSDF_WESTIN_H__ - -CCL_NAMESPACE_BEGIN - -/* WESTIN BACKSCATTER */ - -ccl_device int bsdf_westin_backscatter_setup(ShaderClosure *sc) -{ - float roughness = sc->data0; - roughness = clamp(roughness, 1e-5f, 1.0f); - float m_invroughness = 1.0f/roughness; - - sc->type = CLOSURE_BSDF_WESTIN_BACKSCATTER_ID; - sc->data0 = m_invroughness; - - return SD_BSDF|SD_BSDF_HAS_EVAL|SD_BSDF_GLOSSY; -} - -ccl_device void bsdf_westin_backscatter_blur(ShaderClosure *sc, float roughness) -{ - float m_invroughness = sc->data0; - m_invroughness = min(1.0f/roughness, m_invroughness); - sc->data0 = m_invroughness; -} - -ccl_device float3 bsdf_westin_backscatter_eval_reflect(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf) -{ - float m_invroughness = sc->data0; - float3 N = sc->N; - - // pdf is implicitly 0 (no indirect sampling) - float cosNO = dot(N, I); - float cosNI = dot(N, omega_in); - if(cosNO > 0 && cosNI > 0) { - float cosine = dot(I, omega_in); - *pdf = cosine > 0 ? 
(m_invroughness + 1) * powf(cosine, m_invroughness) : 0; - *pdf *= 0.5f * M_1_PI_F; - return make_float3 (*pdf, *pdf, *pdf); - } - return make_float3 (0, 0, 0); -} - -ccl_device float3 bsdf_westin_backscatter_eval_transmit(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf) -{ - return make_float3(0.0f, 0.0f, 0.0f); -} - -ccl_device int bsdf_westin_backscatter_sample(const ShaderClosure *sc, float3 Ng, float3 I, float3 dIdx, float3 dIdy, float randu, float randv, float3 *eval, float3 *omega_in, float3 *domega_in_dx, float3 *domega_in_dy, float *pdf) -{ - float m_invroughness = sc->data0; - float3 N = sc->N; - - float cosNO = dot(N, I); - if(cosNO > 0) { -#ifdef __RAY_DIFFERENTIALS__ - *domega_in_dx = dIdx; - *domega_in_dy = dIdy; -#endif - float3 T, B; - make_orthonormals (I, &T, &B); - float phi = M_2PI_F * randu; - float cosTheta = powf(randv, 1 / (m_invroughness + 1)); - float sinTheta2 = 1 - cosTheta * cosTheta; - float sinTheta = sinTheta2 > 0 ? sqrtf(sinTheta2) : 0; - *omega_in = (cosf(phi) * sinTheta) * T + - (sinf(phi) * sinTheta) * B + - (cosTheta) * I; - if(dot(Ng, *omega_in) > 0) { - // common terms for pdf and eval - float cosNI = dot(N, *omega_in); - // make sure the direction we chose is still in the right hemisphere - if(cosNI > 0) - { - *pdf = 0.5f * M_1_PI_F * powf(cosTheta, m_invroughness); - *pdf = (m_invroughness + 1) * (*pdf); - *eval = make_float3(*pdf, *pdf, *pdf); - } - } - } - return LABEL_REFLECT|LABEL_GLOSSY; -} - -/* WESTIN SHEEN */ - -ccl_device int bsdf_westin_sheen_setup(ShaderClosure *sc) -{ - /* float edginess = sc->data0; */ - sc->type = CLOSURE_BSDF_WESTIN_SHEEN_ID; - return SD_BSDF|SD_BSDF_HAS_EVAL|SD_BSDF_GLOSSY; -} - -ccl_device void bsdf_westin_sheen_blur(ShaderClosure *sc, float roughness) -{ -} - -ccl_device float3 bsdf_westin_sheen_eval_reflect(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf) -{ - float m_edginess = sc->data0; - float3 N = sc->N; - - // pdf is implicitly 0 
(no indirect sampling) - float cosNO = dot(N, I); - float cosNI = dot(N, omega_in); - if(cosNO > 0 && cosNI > 0) { - float sinNO2 = 1 - cosNO * cosNO; - *pdf = cosNI * M_1_PI_F; - float westin = sinNO2 > 0 ? powf(sinNO2, 0.5f * m_edginess) * (*pdf) : 0; - return make_float3 (westin, westin, westin); - } - return make_float3 (0, 0, 0); -} - -ccl_device float3 bsdf_westin_sheen_eval_transmit(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf) -{ - return make_float3(0.0f, 0.0f, 0.0f); -} - -ccl_device int bsdf_westin_sheen_sample(const ShaderClosure *sc, float3 Ng, float3 I, float3 dIdx, float3 dIdy, float randu, float randv, float3 *eval, float3 *omega_in, float3 *domega_in_dx, float3 *domega_in_dy, float *pdf) -{ - float m_edginess = sc->data0; - float3 N = sc->N; - - // we are viewing the surface from the right side - send a ray out with cosine - // distribution over the hemisphere - sample_cos_hemisphere(N, randu, randv, omega_in, pdf); - if(dot(Ng, *omega_in) > 0) { - // TODO: account for sheen when sampling - float cosNO = dot(N, I); - float sinNO2 = 1 - cosNO * cosNO; - float westin = sinNO2 > 0 ? 
powf(sinNO2, 0.5f * m_edginess) * (*pdf) : 0; - *eval = make_float3(westin, westin, westin); -#ifdef __RAY_DIFFERENTIALS__ - // TODO: find a better approximation for the diffuse bounce - *domega_in_dx = (2 * dot(N, dIdx)) * N - dIdx; - *domega_in_dy = (2 * dot(N, dIdy)) * N - dIdy; -#endif - } - else { - pdf = 0; - } - return LABEL_REFLECT|LABEL_DIFFUSE; -} - -CCL_NAMESPACE_END - -#endif /* __BSDF_WESTIN_H__ */ - diff --git a/intern/cycles/kernel/geom/geom_bvh.h b/intern/cycles/kernel/geom/geom_bvh.h index dd7c25d581d..c5336e086b7 100644 --- a/intern/cycles/kernel/geom/geom_bvh.h +++ b/intern/cycles/kernel/geom/geom_bvh.h @@ -28,6 +28,13 @@ CCL_NAMESPACE_BEGIN +/* Don't inline intersect functions on GPU, this is faster */ +#ifdef __KERNEL_GPU__ +#define ccl_device_intersect ccl_device_noinline +#else +#define ccl_device_intersect ccl_device_inline +#endif + /* BVH intersection function variations */ #define BVH_INSTANCING 1 @@ -35,6 +42,8 @@ CCL_NAMESPACE_BEGIN #define BVH_HAIR 4 #define BVH_HAIR_MINIMUM_WIDTH 8 +/* Regular BVH traversal */ + #define BVH_FUNCTION_NAME bvh_intersect #define BVH_FUNCTION_FEATURES 0 #include "geom_bvh_traversal.h" @@ -63,6 +72,8 @@ CCL_NAMESPACE_BEGIN #include "geom_bvh_traversal.h" #endif +/* Subsurface scattering BVH traversal */ + #if defined(__SUBSURFACE__) #define BVH_FUNCTION_NAME bvh_intersect_subsurface #define BVH_FUNCTION_FEATURES 0 @@ -93,47 +104,72 @@ CCL_NAMESPACE_BEGIN #include "geom_bvh_subsurface.h" #endif +/* Record all BVH intersection for shadows */ + #if defined(__SHADOW_RECORD_ALL__) #define BVH_FUNCTION_NAME bvh_intersect_shadow_all #define BVH_FUNCTION_FEATURES 0 #include "geom_bvh_shadow.h" #endif -#if defined(__SUBSURFACE__) && defined(__INSTANCING__) +#if defined(__SHADOW_RECORD_ALL__) && defined(__INSTANCING__) #define BVH_FUNCTION_NAME bvh_intersect_shadow_all_instancing #define BVH_FUNCTION_FEATURES BVH_INSTANCING #include "geom_bvh_shadow.h" #endif -#if defined(__SUBSURFACE__) && defined(__HAIR__) +#if 
defined(__SHADOW_RECORD_ALL__) && defined(__HAIR__) #define BVH_FUNCTION_NAME bvh_intersect_shadow_all_hair #define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR #include "geom_bvh_shadow.h" #endif -#if defined(__SUBSURFACE__) && defined(__OBJECT_MOTION__) +#if defined(__SHADOW_RECORD_ALL__) && defined(__OBJECT_MOTION__) #define BVH_FUNCTION_NAME bvh_intersect_shadow_all_motion #define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_MOTION #include "geom_bvh_shadow.h" #endif -#if defined(__SUBSURFACE__) && defined(__HAIR__) && defined(__OBJECT_MOTION__) +#if defined(__SHADOW_RECORD_ALL__) && defined(__HAIR__) && defined(__OBJECT_MOTION__) #define BVH_FUNCTION_NAME bvh_intersect_shadow_all_hair_motion #define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR|BVH_MOTION #include "geom_bvh_shadow.h" #endif -/* to work around titan bug when using arrays instead of textures */ -#if !defined(__KERNEL_CUDA__) || defined(__KERNEL_CUDA_TEX_STORAGE__) -ccl_device_inline -#else -ccl_device_noinline +/* Camera inside Volume BVH intersection */ + +#if defined(__VOLUME__) +#define BVH_FUNCTION_NAME bvh_intersect_volume +#define BVH_FUNCTION_FEATURES 0 +#include "geom_bvh_volume.h" #endif -#ifdef __HAIR__ -bool scene_intersect(KernelGlobals *kg, const Ray *ray, const uint visibility, Intersection *isect, uint *lcg_state, float difl, float extmax) -#else -bool scene_intersect(KernelGlobals *kg, const Ray *ray, const uint visibility, Intersection *isect) + +#if defined(__VOLUME__) && defined(__INSTANCING__) +#define BVH_FUNCTION_NAME bvh_intersect_volume_instancing +#define BVH_FUNCTION_FEATURES BVH_INSTANCING +#include "geom_bvh_volume.h" +#endif + +#if defined(__VOLUME__) && defined(__HAIR__) +#define BVH_FUNCTION_NAME bvh_intersect_volume_hair +#define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR|BVH_HAIR_MINIMUM_WIDTH +#include "geom_bvh_volume.h" +#endif + +#if defined(__VOLUME__) && defined(__OBJECT_MOTION__) +#define BVH_FUNCTION_NAME bvh_intersect_volume_motion +#define 
BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_MOTION +#include "geom_bvh_volume.h" +#endif + +#if defined(__VOLUME__) && defined(__HAIR__) && defined(__OBJECT_MOTION__) +#define BVH_FUNCTION_NAME bvh_intersect_volume_hair_motion +#define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR|BVH_HAIR_MINIMUM_WIDTH|BVH_MOTION +#include "geom_bvh_volume.h" #endif + +ccl_device_intersect bool scene_intersect(KernelGlobals *kg, const Ray *ray, const uint visibility, Intersection *isect, + uint *lcg_state, float difl, float extmax) { #ifdef __OBJECT_MOTION__ if(kernel_data.bvh.have_motion) { @@ -170,14 +206,8 @@ bool scene_intersect(KernelGlobals *kg, const Ray *ray, const uint visibility, I #endif /* __KERNEL_CPU__ */ } -/* to work around titan bug when using arrays instead of textures */ #ifdef __SUBSURFACE__ -#if !defined(__KERNEL_CUDA__) || defined(__KERNEL_CUDA_TEX_STORAGE__) -ccl_device_inline -#else -ccl_device_noinline -#endif -uint scene_intersect_subsurface(KernelGlobals *kg, const Ray *ray, Intersection *isect, int subsurface_object, uint *lcg_state, int max_hits) +ccl_device_intersect uint scene_intersect_subsurface(KernelGlobals *kg, const Ray *ray, Intersection *isect, int subsurface_object, uint *lcg_state, int max_hits) { #ifdef __OBJECT_MOTION__ if(kernel_data.bvh.have_motion) { @@ -215,14 +245,8 @@ uint scene_intersect_subsurface(KernelGlobals *kg, const Ray *ray, Intersection } #endif -/* to work around titan bug when using arrays instead of textures */ #ifdef __SHADOW_RECORD_ALL__ -#if !defined(__KERNEL_CUDA__) || defined(__KERNEL_CUDA_TEX_STORAGE__) -ccl_device_inline -#else -ccl_device_noinline -#endif -uint scene_intersect_shadow_all(KernelGlobals *kg, const Ray *ray, Intersection *isect, uint max_hits, uint *num_hits) +ccl_device_intersect bool scene_intersect_shadow_all(KernelGlobals *kg, const Ray *ray, Intersection *isect, uint max_hits, uint *num_hits) { #ifdef __OBJECT_MOTION__ if(kernel_data.bvh.have_motion) { @@ -240,20 +264,50 @@ uint 
scene_intersect_shadow_all(KernelGlobals *kg, const Ray *ray, Intersection return bvh_intersect_shadow_all_hair(kg, ray, isect, max_hits, num_hits); #endif /* __HAIR__ */ -#ifdef __KERNEL_CPU__ - #ifdef __INSTANCING__ if(kernel_data.bvh.have_instancing) return bvh_intersect_shadow_all_instancing(kg, ray, isect, max_hits, num_hits); #endif /* __INSTANCING__ */ return bvh_intersect_shadow_all(kg, ray, isect, max_hits, num_hits); +} +#endif + +#ifdef __VOLUME__ +ccl_device_intersect bool scene_intersect_volume(KernelGlobals *kg, + const Ray *ray, + Intersection *isect) +{ +#ifdef __OBJECT_MOTION__ + if(kernel_data.bvh.have_motion) { +#ifdef __HAIR__ + if(kernel_data.bvh.have_curves) + return bvh_intersect_volume_hair_motion(kg, ray, isect); +#endif /* __HAIR__ */ + + return bvh_intersect_volume_motion(kg, ray, isect); + } +#endif /* __OBJECT_MOTION__ */ + +#ifdef __HAIR__ + if(kernel_data.bvh.have_curves) + return bvh_intersect_volume_hair(kg, ray, isect); +#endif /* __HAIR__ */ + +#ifdef __KERNEL_CPU__ + +#ifdef __INSTANCING__ + if(kernel_data.bvh.have_instancing) + return bvh_intersect_volume_instancing(kg, ray, isect); +#endif /* __INSTANCING__ */ + + return bvh_intersect_volume(kg, ray, isect); #else /* __KERNEL_CPU__ */ #ifdef __INSTANCING__ - return bvh_intersect_shadow_all_instancing(kg, ray, isect, max_hits, num_hits); + return bvh_intersect_volume_instancing(kg, ray, isect); #else - return bvh_intersect_shadow_all(kg, ray, isect, max_hits, num_hits); + return bvh_intersect_volume(kg, ray, isect); #endif /* __INSTANCING__ */ #endif /* __KERNEL_CPU__ */ diff --git a/intern/cycles/kernel/geom/geom_bvh_shadow.h b/intern/cycles/kernel/geom/geom_bvh_shadow.h index 98bf82b3b2d..aee4097d77e 100644 --- a/intern/cycles/kernel/geom/geom_bvh_shadow.h +++ b/intern/cycles/kernel/geom/geom_bvh_shadow.h @@ -68,15 +68,15 @@ ccl_device bool BVH_FUNCTION_NAME const shuffle_swap_t shuf_identity = shuffle_swap_identity(); const shuffle_swap_t shuf_swap = shuffle_swap_swap(); - 
const __m128 pn = _mm_castsi128_ps(_mm_set_epi32(0x80000000, 0x80000000, 0, 0)); - __m128 Psplat[3], idirsplat[3]; + const ssef pn = cast(ssei(0, 0, 0x80000000, 0x80000000)); + ssef Psplat[3], idirsplat[3]; shuffle_swap_t shufflexyz[3]; - Psplat[0] = _mm_set_ps1(P.x); - Psplat[1] = _mm_set_ps1(P.y); - Psplat[2] = _mm_set_ps1(P.z); + Psplat[0] = ssef(P.x); + Psplat[1] = ssef(P.y); + Psplat[2] = ssef(P.z); - __m128 tsplat = _mm_set_ps(-isect_t, -isect_t, 0.0f, 0.0f); + ssef tsplat(0.0f, 0.0f, -isect_t, -isect_t); gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz); #endif @@ -132,27 +132,27 @@ ccl_device bool BVH_FUNCTION_NAME /* Intersect two child bounding boxes, SSE3 version adapted from Embree */ /* fetch node data */ - const __m128 *bvh_nodes = (__m128*)kg->__bvh_nodes.data + nodeAddr*BVH_NODE_SIZE; + const ssef *bvh_nodes = (ssef*)kg->__bvh_nodes.data + nodeAddr*BVH_NODE_SIZE; const float4 cnodes = ((float4*)bvh_nodes)[3]; /* intersect ray against child nodes */ - const __m128 tminmaxx = _mm_mul_ps(_mm_sub_ps(shuffle_swap(bvh_nodes[0], shufflexyz[0]), Psplat[0]), idirsplat[0]); - const __m128 tminmaxy = _mm_mul_ps(_mm_sub_ps(shuffle_swap(bvh_nodes[1], shufflexyz[1]), Psplat[1]), idirsplat[1]); - const __m128 tminmaxz = _mm_mul_ps(_mm_sub_ps(shuffle_swap(bvh_nodes[2], shufflexyz[2]), Psplat[2]), idirsplat[2]); + const ssef tminmaxx = (shuffle_swap(bvh_nodes[0], shufflexyz[0]) - Psplat[0]) * idirsplat[0]; + const ssef tminmaxy = (shuffle_swap(bvh_nodes[1], shufflexyz[1]) - Psplat[1]) * idirsplat[1]; + const ssef tminmaxz = (shuffle_swap(bvh_nodes[2], shufflexyz[2]) - Psplat[2]) * idirsplat[2]; /* calculate { c0min, c1min, -c0max, -c1max} */ - __m128 minmax = _mm_max_ps(_mm_max_ps(tminmaxx, tminmaxy), _mm_max_ps(tminmaxz, tsplat)); - const __m128 tminmax = _mm_xor_ps(minmax, pn); - const __m128 lrhit = _mm_cmple_ps(tminmax, shuffle<2, 3, 0, 1>(tminmax)); + const ssef minmax = max(max(tminmaxx, tminmaxy), max(tminmaxz, tsplat)); + const 
ssef tminmax = minmax ^ pn; + const sseb lrhit = tminmax <= shuffle<2, 3, 0, 1>(tminmax); /* decide which nodes to traverse next */ #ifdef __VISIBILITY_FLAG__ /* this visibility test gives a 5% performance hit, how to solve? */ - traverseChild0 = (_mm_movemask_ps(lrhit) & 1) && (__float_as_uint(cnodes.z) & PATH_RAY_SHADOW); - traverseChild1 = (_mm_movemask_ps(lrhit) & 2) && (__float_as_uint(cnodes.w) & PATH_RAY_SHADOW); + traverseChild0 = (movemask(lrhit) & 1) && (__float_as_uint(cnodes.z) & PATH_RAY_SHADOW); + traverseChild1 = (movemask(lrhit) & 2) && (__float_as_uint(cnodes.w) & PATH_RAY_SHADOW); #else - traverseChild0 = (_mm_movemask_ps(lrhit) & 1); - traverseChild1 = (_mm_movemask_ps(lrhit) & 2); + traverseChild0 = (movemask(lrhit) & 1); + traverseChild1 = (movemask(lrhit) & 2); #endif #endif // __KERNEL_SSE2__ @@ -164,9 +164,7 @@ ccl_device bool BVH_FUNCTION_NAME #if !defined(__KERNEL_SSE2__) bool closestChild1 = (c1min < c0min); #else - union { __m128 m128; float v[4]; } uminmax; - uminmax.m128 = tminmax; - bool closestChild1 = uminmax.v[1] < uminmax.v[0]; + bool closestChild1 = tminmax[1] < tminmax[0]; #endif if(closestChild1) { @@ -254,8 +252,7 @@ ccl_device bool BVH_FUNCTION_NAME if(kernel_tex_fetch(__prim_type, isect_array->prim) & PRIMITIVE_ALL_TRIANGLE) #endif { - float4 Ns = kernel_tex_fetch(__tri_normal, prim); - shader = __float_as_int(Ns.w); + shader = kernel_tex_fetch(__tri_shader, prim); } #ifdef __HAIR__ else { @@ -301,12 +298,12 @@ ccl_device bool BVH_FUNCTION_NAME num_hits_in_instance = 0; #if defined(__KERNEL_SSE2__) - Psplat[0] = _mm_set_ps1(P.x); - Psplat[1] = _mm_set_ps1(P.y); - Psplat[2] = _mm_set_ps1(P.z); + Psplat[0] = ssef(P.x); + Psplat[1] = ssef(P.y); + Psplat[2] = ssef(P.z); isect_array->t = isect_t; - tsplat = _mm_set_ps(-isect_t, -isect_t, 0.0f, 0.0f); + tsplat = ssef(0.0f, 0.0f, -isect_t, -isect_t); gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz); #endif @@ -348,13 +345,13 @@ ccl_device bool 
BVH_FUNCTION_NAME } #if defined(__KERNEL_SSE2__) - Psplat[0] = _mm_set_ps1(P.x); - Psplat[1] = _mm_set_ps1(P.y); - Psplat[2] = _mm_set_ps1(P.z); + Psplat[0] = ssef(P.x); + Psplat[1] = ssef(P.y); + Psplat[2] = ssef(P.z); isect_t = tmax; isect_array->t = isect_t; - tsplat = _mm_set_ps(-isect_t, -isect_t, 0.0f, 0.0f); + tsplat = ssef(0.0f, 0.0f, -isect_t, -isect_t); gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz); #endif diff --git a/intern/cycles/kernel/geom/geom_bvh_subsurface.h b/intern/cycles/kernel/geom/geom_bvh_subsurface.h index a19f05dd371..a8f57cffa78 100644 --- a/intern/cycles/kernel/geom/geom_bvh_subsurface.h +++ b/intern/cycles/kernel/geom/geom_bvh_subsurface.h @@ -65,15 +65,15 @@ ccl_device uint BVH_FUNCTION_NAME(KernelGlobals *kg, const Ray *ray, Intersectio const shuffle_swap_t shuf_identity = shuffle_swap_identity(); const shuffle_swap_t shuf_swap = shuffle_swap_swap(); - const __m128 pn = _mm_castsi128_ps(_mm_set_epi32(0x80000000, 0x80000000, 0, 0)); - __m128 Psplat[3], idirsplat[3]; + const ssef pn = cast(ssei(0, 0, 0x80000000, 0x80000000)); + ssef Psplat[3], idirsplat[3]; shuffle_swap_t shufflexyz[3]; - Psplat[0] = _mm_set_ps1(P.x); - Psplat[1] = _mm_set_ps1(P.y); - Psplat[2] = _mm_set_ps1(P.z); + Psplat[0] = ssef(P.x); + Psplat[1] = ssef(P.y); + Psplat[2] = ssef(P.z); - __m128 tsplat = _mm_set_ps(-isect_t, -isect_t, 0.0f, 0.0f); + ssef tsplat(0.0f, 0.0f, -isect_t, -isect_t); gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz); #endif @@ -131,25 +131,27 @@ ccl_device uint BVH_FUNCTION_NAME(KernelGlobals *kg, const Ray *ray, Intersectio /* Intersect two child bounding boxes, SSE3 version adapted from Embree */ /* fetch node data */ - const __m128 *bvh_nodes = (__m128*)kg->__bvh_nodes.data + nodeAddr*BVH_NODE_SIZE; + const ssef *bvh_nodes = (ssef*)kg->__bvh_nodes.data + nodeAddr*BVH_NODE_SIZE; const float4 cnodes = ((float4*)bvh_nodes)[3]; /* intersect ray against child nodes */ - const __m128 
tminmaxx = _mm_mul_ps(_mm_sub_ps(shuffle_swap(bvh_nodes[0], shufflexyz[0]), Psplat[0]), idirsplat[0]); - const __m128 tminmaxy = _mm_mul_ps(_mm_sub_ps(shuffle_swap(bvh_nodes[1], shufflexyz[1]), Psplat[1]), idirsplat[1]); - const __m128 tminmaxz = _mm_mul_ps(_mm_sub_ps(shuffle_swap(bvh_nodes[2], shufflexyz[2]), Psplat[2]), idirsplat[2]); + const ssef tminmaxx = (shuffle_swap(bvh_nodes[0], shufflexyz[0]) - Psplat[0]) * idirsplat[0]; + const ssef tminmaxy = (shuffle_swap(bvh_nodes[1], shufflexyz[1]) - Psplat[1]) * idirsplat[1]; + const ssef tminmaxz = (shuffle_swap(bvh_nodes[2], shufflexyz[2]) - Psplat[2]) * idirsplat[2]; - const __m128 tminmax = _mm_xor_ps(_mm_max_ps(_mm_max_ps(tminmaxx, tminmaxy), _mm_max_ps(tminmaxz, tsplat)), pn); - const __m128 lrhit = _mm_cmple_ps(tminmax, shuffle<2, 3, 0, 1>(tminmax)); + /* calculate { c0min, c1min, -c0max, -c1max} */ + const ssef minmax = max(max(tminmaxx, tminmaxy), max(tminmaxz, tsplat)); + const ssef tminmax = minmax ^ pn; + const sseb lrhit = tminmax <= shuffle<2, 3, 0, 1>(tminmax); /* decide which nodes to traverse next */ #ifdef __VISIBILITY_FLAG__ /* this visibility test gives a 5% performance hit, how to solve? 
*/ - traverseChild0 = (_mm_movemask_ps(lrhit) & 1) && (__float_as_uint(cnodes.z) & visibility); - traverseChild1 = (_mm_movemask_ps(lrhit) & 2) && (__float_as_uint(cnodes.w) & visibility); + traverseChild0 = (movemask(lrhit) & 1) && (__float_as_uint(cnodes.z) & visibility); + traverseChild1 = (movemask(lrhit) & 2) && (__float_as_uint(cnodes.w) & visibility); #else - traverseChild0 = (_mm_movemask_ps(lrhit) & 1); - traverseChild1 = (_mm_movemask_ps(lrhit) & 2); + traverseChild0 = (movemask(lrhit) & 1); + traverseChild1 = (movemask(lrhit) & 2); #endif #endif // __KERNEL_SSE2__ @@ -161,9 +163,7 @@ ccl_device uint BVH_FUNCTION_NAME(KernelGlobals *kg, const Ray *ray, Intersectio #if !defined(__KERNEL_SSE2__) bool closestChild1 = (c1min < c0min); #else - union { __m128 m128; float v[4]; } uminmax; - uminmax.m128 = tminmax; - bool closestChild1 = uminmax.v[1] < uminmax.v[0]; + bool closestChild1 = tminmax[1] < tminmax[0]; #endif if(closestChild1) { @@ -243,11 +243,11 @@ ccl_device uint BVH_FUNCTION_NAME(KernelGlobals *kg, const Ray *ray, Intersectio #endif #if defined(__KERNEL_SSE2__) - Psplat[0] = _mm_set_ps1(P.x); - Psplat[1] = _mm_set_ps1(P.y); - Psplat[2] = _mm_set_ps1(P.z); + Psplat[0] = ssef(P.x); + Psplat[1] = ssef(P.y); + Psplat[2] = ssef(P.z); - tsplat = _mm_set_ps(-isect_t, -isect_t, 0.0f, 0.0f); + tsplat = ssef(0.0f, 0.0f, -isect_t, -isect_t); gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz); #endif @@ -279,11 +279,11 @@ ccl_device uint BVH_FUNCTION_NAME(KernelGlobals *kg, const Ray *ray, Intersectio #endif #if defined(__KERNEL_SSE2__) - Psplat[0] = _mm_set_ps1(P.x); - Psplat[1] = _mm_set_ps1(P.y); - Psplat[2] = _mm_set_ps1(P.z); + Psplat[0] = ssef(P.x); + Psplat[1] = ssef(P.y); + Psplat[2] = ssef(P.z); - tsplat = _mm_set_ps(-isect_t, -isect_t, 0.0f, 0.0f); + tsplat = ssef(0.0f, 0.0f, -isect_t, -isect_t); gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz); #endif diff --git 
a/intern/cycles/kernel/geom/geom_bvh_traversal.h b/intern/cycles/kernel/geom/geom_bvh_traversal.h index 9fd40f91471..114d30a479d 100644 --- a/intern/cycles/kernel/geom/geom_bvh_traversal.h +++ b/intern/cycles/kernel/geom/geom_bvh_traversal.h @@ -63,24 +63,28 @@ ccl_device bool BVH_FUNCTION_NAME #endif isect->t = ray->t; - isect->object = OBJECT_NONE; - isect->prim = PRIM_NONE; isect->u = 0.0f; isect->v = 0.0f; + isect->prim = PRIM_NONE; + isect->object = OBJECT_NONE; + +#if defined(__KERNEL_DEBUG__) + isect->num_traversal_steps = 0; +#endif #if defined(__KERNEL_SSE2__) const shuffle_swap_t shuf_identity = shuffle_swap_identity(); const shuffle_swap_t shuf_swap = shuffle_swap_swap(); - const __m128 pn = _mm_castsi128_ps(_mm_set_epi32(0x80000000, 0x80000000, 0, 0)); - __m128 Psplat[3], idirsplat[3]; + const ssef pn = cast(ssei(0, 0, 0x80000000, 0x80000000)); + ssef Psplat[3], idirsplat[3]; shuffle_swap_t shufflexyz[3]; - Psplat[0] = _mm_set_ps1(P.x); - Psplat[1] = _mm_set_ps1(P.y); - Psplat[2] = _mm_set_ps1(P.z); + Psplat[0] = ssef(P.x); + Psplat[1] = ssef(P.y); + Psplat[2] = ssef(P.z); - __m128 tsplat = _mm_set_ps(-isect->t, -isect->t, 0.0f, 0.0f); + ssef tsplat(0.0f, 0.0f, -isect->t, -isect->t); gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz); #endif @@ -151,17 +155,17 @@ ccl_device bool BVH_FUNCTION_NAME /* Intersect two child bounding boxes, SSE3 version adapted from Embree */ /* fetch node data */ - const __m128 *bvh_nodes = (__m128*)kg->__bvh_nodes.data + nodeAddr*BVH_NODE_SIZE; + const ssef *bvh_nodes = (ssef*)kg->__bvh_nodes.data + nodeAddr*BVH_NODE_SIZE; const float4 cnodes = ((float4*)bvh_nodes)[3]; /* intersect ray against child nodes */ - const __m128 tminmaxx = _mm_mul_ps(_mm_sub_ps(shuffle_swap(bvh_nodes[0], shufflexyz[0]), Psplat[0]), idirsplat[0]); - const __m128 tminmaxy = _mm_mul_ps(_mm_sub_ps(shuffle_swap(bvh_nodes[1], shufflexyz[1]), Psplat[1]), idirsplat[1]); - const __m128 tminmaxz = 
_mm_mul_ps(_mm_sub_ps(shuffle_swap(bvh_nodes[2], shufflexyz[2]), Psplat[2]), idirsplat[2]); + const ssef tminmaxx = (shuffle_swap(bvh_nodes[0], shufflexyz[0]) - Psplat[0]) * idirsplat[0]; + const ssef tminmaxy = (shuffle_swap(bvh_nodes[1], shufflexyz[1]) - Psplat[1]) * idirsplat[1]; + const ssef tminmaxz = (shuffle_swap(bvh_nodes[2], shufflexyz[2]) - Psplat[2]) * idirsplat[2]; /* calculate { c0min, c1min, -c0max, -c1max} */ - __m128 minmax = _mm_max_ps(_mm_max_ps(tminmaxx, tminmaxy), _mm_max_ps(tminmaxz, tsplat)); - const __m128 tminmax = _mm_xor_ps(minmax, pn); + ssef minmax = max(max(tminmaxx, tminmaxy), max(tminmaxz, tsplat)); + const ssef tminmax = minmax ^ pn; #if FEATURE(BVH_HAIR_MINIMUM_WIDTH) if(difl != 0.0f) { @@ -182,16 +186,16 @@ ccl_device bool BVH_FUNCTION_NAME } #endif - const __m128 lrhit = _mm_cmple_ps(tminmax, shuffle<2, 3, 0, 1>(tminmax)); + const sseb lrhit = tminmax <= shuffle<2, 3, 0, 1>(tminmax); /* decide which nodes to traverse next */ #ifdef __VISIBILITY_FLAG__ /* this visibility test gives a 5% performance hit, how to solve? 
*/ - traverseChild0 = (_mm_movemask_ps(lrhit) & 1) && (__float_as_uint(cnodes.z) & visibility); - traverseChild1 = (_mm_movemask_ps(lrhit) & 2) && (__float_as_uint(cnodes.w) & visibility); + traverseChild0 = (movemask(lrhit) & 1) && (__float_as_uint(cnodes.z) & visibility); + traverseChild1 = (movemask(lrhit) & 2) && (__float_as_uint(cnodes.w) & visibility); #else - traverseChild0 = (_mm_movemask_ps(lrhit) & 1); - traverseChild1 = (_mm_movemask_ps(lrhit) & 2); + traverseChild0 = (movemask(lrhit) & 1); + traverseChild1 = (movemask(lrhit) & 2); #endif #endif // __KERNEL_SSE2__ @@ -203,9 +207,7 @@ ccl_device bool BVH_FUNCTION_NAME #if !defined(__KERNEL_SSE2__) bool closestChild1 = (c1min < c0min); #else - union { __m128 m128; float v[4]; } uminmax; - uminmax.m128 = tminmax; - bool closestChild1 = uminmax.v[1] < uminmax.v[0]; + bool closestChild1 = tminmax[1] < tminmax[0]; #endif if(closestChild1) { @@ -228,6 +230,10 @@ ccl_device bool BVH_FUNCTION_NAME --stackPtr; } } + +#if defined(__KERNEL_DEBUG__) + isect->num_traversal_steps++; +#endif } /* if node is leaf, fetch triangle list */ @@ -276,13 +282,17 @@ ccl_device bool BVH_FUNCTION_NAME } } +#if defined(__KERNEL_DEBUG__) + isect->num_traversal_steps++; +#endif + /* shadow ray early termination */ #if defined(__KERNEL_SSE2__) if(hit) { if(visibility == PATH_RAY_SHADOW_OPAQUE) return true; - tsplat = _mm_set_ps(-isect->t, -isect->t, 0.0f, 0.0f); + tsplat = ssef(0.0f, 0.0f, -isect->t, -isect->t); } #else if(hit && visibility == PATH_RAY_SHADOW_OPAQUE) @@ -304,11 +314,11 @@ ccl_device bool BVH_FUNCTION_NAME #endif #if defined(__KERNEL_SSE2__) - Psplat[0] = _mm_set_ps1(P.x); - Psplat[1] = _mm_set_ps1(P.y); - Psplat[2] = _mm_set_ps1(P.z); + Psplat[0] = ssef(P.x); + Psplat[1] = ssef(P.y); + Psplat[2] = ssef(P.z); - tsplat = _mm_set_ps(-isect->t, -isect->t, 0.0f, 0.0f); + tsplat = ssef(0.0f, 0.0f, -isect->t, -isect->t); gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz); #endif @@ -334,11 +344,11 
@@ ccl_device bool BVH_FUNCTION_NAME #endif #if defined(__KERNEL_SSE2__) - Psplat[0] = _mm_set_ps1(P.x); - Psplat[1] = _mm_set_ps1(P.y); - Psplat[2] = _mm_set_ps1(P.z); + Psplat[0] = ssef(P.x); + Psplat[1] = ssef(P.y); + Psplat[2] = ssef(P.z); - tsplat = _mm_set_ps(-isect->t, -isect->t, 0.0f, 0.0f); + tsplat = ssef(0.0f, 0.0f, -isect->t, -isect->t); gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz); #endif diff --git a/intern/cycles/kernel/geom/geom_bvh_volume.h b/intern/cycles/kernel/geom/geom_bvh_volume.h new file mode 100644 index 00000000000..9dd8d226f5b --- /dev/null +++ b/intern/cycles/kernel/geom/geom_bvh_volume.h @@ -0,0 +1,322 @@ +/* + * Adapted from code Copyright 2009-2010 NVIDIA Corporation, + * and code copyright 2009-2012 Intel Corporation + * + * Modifications Copyright 2011-2014, Blender Foundation. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* This is a template BVH traversal function for volumes, where + * various features can be enabled/disabled. This way we can compile optimized + * versions for each case without new features slowing things down. 
+ * + * BVH_INSTANCING: object instancing + * BVH_HAIR: hair curve rendering + * BVH_MOTION: motion blur rendering + * + */ + +#define FEATURE(f) (((BVH_FUNCTION_FEATURES) & (f)) != 0) + +ccl_device bool BVH_FUNCTION_NAME(KernelGlobals *kg, + const Ray *ray, + Intersection *isect) +{ + /* todo: + * - test if pushing distance on the stack helps (for non shadow rays) + * - separate version for shadow rays + * - likely and unlikely for if() statements + * - test restrict attribute for pointers + */ + + /* traversal stack in CUDA thread-local memory */ + int traversalStack[BVH_STACK_SIZE]; + traversalStack[0] = ENTRYPOINT_SENTINEL; + + /* traversal variables in registers */ + int stackPtr = 0; + int nodeAddr = kernel_data.bvh.root; + + /* ray parameters in registers */ + float3 P = ray->P; + float3 dir = bvh_clamp_direction(ray->D); + float3 idir = bvh_inverse_direction(dir); + int object = OBJECT_NONE; + + const uint visibility = PATH_RAY_ALL_VISIBILITY; + +#if FEATURE(BVH_MOTION) + Transform ob_tfm; +#endif + + isect->t = ray->t; + isect->u = 0.0f; + isect->v = 0.0f; + isect->prim = PRIM_NONE; + isect->object = OBJECT_NONE; + +#if defined(__KERNEL_SSE2__) + const shuffle_swap_t shuf_identity = shuffle_swap_identity(); + const shuffle_swap_t shuf_swap = shuffle_swap_swap(); + + const ssef pn = cast(ssei(0, 0, 0x80000000, 0x80000000)); + ssef Psplat[3], idirsplat[3]; + shuffle_swap_t shufflexyz[3]; + + Psplat[0] = ssef(P.x); + Psplat[1] = ssef(P.y); + Psplat[2] = ssef(P.z); + + ssef tsplat(0.0f, 0.0f, -isect->t, -isect->t); + + gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz); +#endif + + /* traversal loop */ + do { + do { + /* traverse internal nodes */ + while(nodeAddr >= 0 && nodeAddr != ENTRYPOINT_SENTINEL) { + bool traverseChild0, traverseChild1; + int nodeAddrChild1; + +#if !defined(__KERNEL_SSE2__) + /* Intersect two child bounding boxes, non-SSE version */ + float t = isect->t; + + /* fetch node data */ + float4 node0 = 
kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+0); + float4 node1 = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+1); + float4 node2 = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+2); + float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+3); + + /* intersect ray against child nodes */ + NO_EXTENDED_PRECISION float c0lox = (node0.x - P.x) * idir.x; + NO_EXTENDED_PRECISION float c0hix = (node0.z - P.x) * idir.x; + NO_EXTENDED_PRECISION float c0loy = (node1.x - P.y) * idir.y; + NO_EXTENDED_PRECISION float c0hiy = (node1.z - P.y) * idir.y; + NO_EXTENDED_PRECISION float c0loz = (node2.x - P.z) * idir.z; + NO_EXTENDED_PRECISION float c0hiz = (node2.z - P.z) * idir.z; + NO_EXTENDED_PRECISION float c0min = max4(min(c0lox, c0hix), min(c0loy, c0hiy), min(c0loz, c0hiz), 0.0f); + NO_EXTENDED_PRECISION float c0max = min4(max(c0lox, c0hix), max(c0loy, c0hiy), max(c0loz, c0hiz), t); + + NO_EXTENDED_PRECISION float c1lox = (node0.y - P.x) * idir.x; + NO_EXTENDED_PRECISION float c1hix = (node0.w - P.x) * idir.x; + NO_EXTENDED_PRECISION float c1loy = (node1.y - P.y) * idir.y; + NO_EXTENDED_PRECISION float c1hiy = (node1.w - P.y) * idir.y; + NO_EXTENDED_PRECISION float c1loz = (node2.y - P.z) * idir.z; + NO_EXTENDED_PRECISION float c1hiz = (node2.w - P.z) * idir.z; + NO_EXTENDED_PRECISION float c1min = max4(min(c1lox, c1hix), min(c1loy, c1hiy), min(c1loz, c1hiz), 0.0f); + NO_EXTENDED_PRECISION float c1max = min4(max(c1lox, c1hix), max(c1loy, c1hiy), max(c1loz, c1hiz), t); + + /* decide which nodes to traverse next */ +#ifdef __VISIBILITY_FLAG__ + /* this visibility test gives a 5% performance hit, how to solve? 
*/ + traverseChild0 = (c0max >= c0min) && (__float_as_uint(cnodes.z) & visibility); + traverseChild1 = (c1max >= c1min) && (__float_as_uint(cnodes.w) & visibility); +#else + traverseChild0 = (c0max >= c0min); + traverseChild1 = (c1max >= c1min); +#endif + +#else // __KERNEL_SSE2__ + /* Intersect two child bounding boxes, SSE3 version adapted from Embree */ + + /* fetch node data */ + const ssef *bvh_nodes = (ssef*)kg->__bvh_nodes.data + nodeAddr*BVH_NODE_SIZE; + const float4 cnodes = ((float4*)bvh_nodes)[3]; + + /* intersect ray against child nodes */ + const ssef tminmaxx = (shuffle_swap(bvh_nodes[0], shufflexyz[0]) - Psplat[0]) * idirsplat[0]; + const ssef tminmaxy = (shuffle_swap(bvh_nodes[1], shufflexyz[1]) - Psplat[1]) * idirsplat[1]; + const ssef tminmaxz = (shuffle_swap(bvh_nodes[2], shufflexyz[2]) - Psplat[2]) * idirsplat[2]; + + /* calculate { c0min, c1min, -c0max, -c1max} */ + ssef minmax = max(max(tminmaxx, tminmaxy), max(tminmaxz, tsplat)); + const ssef tminmax = minmax ^ pn; + + const sseb lrhit = tminmax <= shuffle<2, 3, 0, 1>(tminmax); + + /* decide which nodes to traverse next */ +#ifdef __VISIBILITY_FLAG__ + /* this visibility test gives a 5% performance hit, how to solve? 
*/ + traverseChild0 = (movemask(lrhit) & 1) && (__float_as_uint(cnodes.z) & visibility); + traverseChild1 = (movemask(lrhit) & 2) && (__float_as_uint(cnodes.w) & visibility); +#else + traverseChild0 = (movemask(lrhit) & 1); + traverseChild1 = (movemask(lrhit) & 2); +#endif +#endif // __KERNEL_SSE2__ + + nodeAddr = __float_as_int(cnodes.x); + nodeAddrChild1 = __float_as_int(cnodes.y); + + if(traverseChild0 && traverseChild1) { + /* both children were intersected, push the farther one */ +#if !defined(__KERNEL_SSE2__) + bool closestChild1 = (c1min < c0min); +#else + bool closestChild1 = tminmax[1] < tminmax[0]; +#endif + + if(closestChild1) { + int tmp = nodeAddr; + nodeAddr = nodeAddrChild1; + nodeAddrChild1 = tmp; + } + + ++stackPtr; + traversalStack[stackPtr] = nodeAddrChild1; + } + else { + /* one child was intersected */ + if(traverseChild1) { + nodeAddr = nodeAddrChild1; + } + else if(!traverseChild0) { + /* neither child was intersected */ + nodeAddr = traversalStack[stackPtr]; + --stackPtr; + } + } + } + + /* if node is leaf, fetch triangle list */ + if(nodeAddr < 0) { + float4 leaf = kernel_tex_fetch(__bvh_nodes, (-nodeAddr-1)*BVH_NODE_SIZE+(BVH_NODE_SIZE-1)); + int primAddr = __float_as_int(leaf.x); + +#if FEATURE(BVH_INSTANCING) + if(primAddr >= 0) { +#endif + int primAddr2 = __float_as_int(leaf.y); + + /* pop */ + nodeAddr = traversalStack[stackPtr]; + --stackPtr; + + /* primitive intersection */ + for(; primAddr < primAddr2; primAddr++) { + /* only primitives from volume object */ + uint tri_object = (object == OBJECT_NONE)? 
kernel_tex_fetch(__prim_object, primAddr): object; + int object_flag = kernel_tex_fetch(__object_flag, tri_object); + + if((object_flag & SD_OBJECT_HAS_VOLUME) == 0) { + continue; + } + + /* intersect ray against primitive */ + uint type = kernel_tex_fetch(__prim_type, primAddr); + + switch(type & PRIMITIVE_ALL) { + case PRIMITIVE_TRIANGLE: { + triangle_intersect(kg, isect, P, dir, visibility, object, primAddr); + break; + } +#if FEATURE(BVH_MOTION) + case PRIMITIVE_MOTION_TRIANGLE: { + motion_triangle_intersect(kg, isect, P, dir, ray->time, visibility, object, primAddr); + break; + } +#endif +#if FEATURE(BVH_HAIR) + case PRIMITIVE_CURVE: + case PRIMITIVE_MOTION_CURVE: { + if(kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE) + bvh_cardinal_curve_intersect(kg, isect, P, dir, visibility, object, primAddr, ray->time, type, NULL, 0, 0); + else + bvh_curve_intersect(kg, isect, P, dir, visibility, object, primAddr, ray->time, type, NULL, 0, 0); + break; + } +#endif + default: { + break; + } + } + } + } +#if FEATURE(BVH_INSTANCING) + else { + /* instance push */ + object = kernel_tex_fetch(__prim_object, -primAddr-1); + int object_flag = kernel_tex_fetch(__object_flag, object); + + if(object_flag & SD_OBJECT_HAS_VOLUME) { + +#if FEATURE(BVH_MOTION) + bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &isect->t, &ob_tfm); +#else + bvh_instance_push(kg, object, ray, &P, &dir, &idir, &isect->t); +#endif + +#if defined(__KERNEL_SSE2__) + Psplat[0] = ssef(P.x); + Psplat[1] = ssef(P.y); + Psplat[2] = ssef(P.z); + + tsplat = ssef(0.0f, 0.0f, -isect->t, -isect->t); + + gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz); +#endif + + ++stackPtr; + traversalStack[stackPtr] = ENTRYPOINT_SENTINEL; + + nodeAddr = kernel_tex_fetch(__object_node, object); + } + else { + /* pop */ + nodeAddr = traversalStack[stackPtr]; + --stackPtr; + } + } + } +#endif + } while(nodeAddr != ENTRYPOINT_SENTINEL); + +#if FEATURE(BVH_INSTANCING) + if(stackPtr >= 0) { + 
kernel_assert(object != OBJECT_NONE); + + /* instance pop */ +#if FEATURE(BVH_MOTION) + bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, &isect->t, &ob_tfm); +#else + bvh_instance_pop(kg, object, ray, &P, &dir, &idir, &isect->t); +#endif + +#if defined(__KERNEL_SSE2__) + Psplat[0] = ssef(P.x); + Psplat[1] = ssef(P.y); + Psplat[2] = ssef(P.z); + + tsplat = ssef(0.0f, 0.0f, -isect->t, -isect->t); + + gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz); +#endif + + object = OBJECT_NONE; + nodeAddr = traversalStack[stackPtr]; + --stackPtr; + } +#endif + } while(nodeAddr != ENTRYPOINT_SENTINEL); + + return (isect->prim != PRIM_NONE); +} + +#undef FEATURE +#undef BVH_FUNCTION_NAME +#undef BVH_FUNCTION_FEATURES + diff --git a/intern/cycles/kernel/geom/geom_curve.h b/intern/cycles/kernel/geom/geom_curve.h index e1d225436a6..b6d21c91916 100644 --- a/intern/cycles/kernel/geom/geom_curve.h +++ b/intern/cycles/kernel/geom/geom_curve.h @@ -214,9 +214,9 @@ ccl_device_inline void curvebounds(float *lower, float *upper, float *extremta, } #ifdef __KERNEL_SSE2__ -ccl_device_inline __m128 transform_point_T3(const __m128 t[3], const __m128 &a) +ccl_device_inline ssef transform_point_T3(const ssef t[3], const ssef &a) { - return fma(broadcast<0>(a), t[0], fma(broadcast<1>(a), t[1], _mm_mul_ps(broadcast<2>(a), t[2]))); + return madd(shuffle<0>(a), t[0], madd(shuffle<1>(a), t[1], shuffle<2>(a) * t[2])); } #endif @@ -238,16 +238,16 @@ ccl_device_inline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersect int prim = kernel_tex_fetch(__prim_index, curveAddr); #ifdef __KERNEL_SSE2__ - __m128 vdir = load_m128(dir); - __m128 vcurve_coef[4]; + ssef vdir = load4f(dir); + ssef vcurve_coef[4]; const float3 *curve_coef = (float3 *)vcurve_coef; { - __m128 dtmp = _mm_mul_ps(vdir, vdir); - __m128 d_ss = _mm_sqrt_ss(_mm_add_ss(dtmp, broadcast<2>(dtmp))); - __m128 rd_ss = _mm_div_ss(_mm_set_ss(1.0f), d_ss); + ssef dtmp = vdir * vdir; + ssef d_ss = 
mm_sqrt(dtmp + shuffle<2>(dtmp)); + ssef rd_ss = load1f_first(1.0f) / d_ss; - __m128i v00vec = _mm_load_si128((__m128i *)&kg->__curves.data[prim]); + ssei v00vec = load4i((ssei *)&kg->__curves.data[prim]); int2 &v00 = (int2 &)v00vec; int k0 = v00.x + segment; @@ -255,44 +255,44 @@ ccl_device_inline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersect int ka = max(k0 - 1, v00.x); int kb = min(k1 + 1, v00.x + v00.y - 1); - __m128 P_curve[4]; + ssef P_curve[4]; if(type & PRIMITIVE_CURVE) { - P_curve[0] = _mm_load_ps(&kg->__curve_keys.data[ka].x); - P_curve[1] = _mm_load_ps(&kg->__curve_keys.data[k0].x); - P_curve[2] = _mm_load_ps(&kg->__curve_keys.data[k1].x); - P_curve[3] = _mm_load_ps(&kg->__curve_keys.data[kb].x); + P_curve[0] = load4f(&kg->__curve_keys.data[ka].x); + P_curve[1] = load4f(&kg->__curve_keys.data[k0].x); + P_curve[2] = load4f(&kg->__curve_keys.data[k1].x); + P_curve[3] = load4f(&kg->__curve_keys.data[kb].x); } else { int fobject = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, curveAddr): object; motion_cardinal_curve_keys(kg, fobject, prim, time, ka, k0, k1, kb, (float4*)&P_curve); } - __m128 rd_sgn = set_sign_bit<0, 1, 1, 1>(broadcast<0>(rd_ss)); - __m128 mul_zxxy = _mm_mul_ps(shuffle<2, 0, 0, 1>(vdir), rd_sgn); - __m128 mul_yz = _mm_mul_ps(shuffle<1, 2, 1, 2>(vdir), mul_zxxy); - __m128 mul_shuf = shuffle<0, 1, 2, 3>(mul_zxxy, mul_yz); - __m128 vdir0 = _mm_and_ps(vdir, _mm_castsi128_ps(_mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0))); + ssef rd_sgn = set_sign_bit<0, 1, 1, 1>(shuffle<0>(rd_ss)); + ssef mul_zxxy = shuffle<2, 0, 0, 1>(vdir) * rd_sgn; + ssef mul_yz = shuffle<1, 2, 1, 2>(vdir) * mul_zxxy; + ssef mul_shuf = shuffle<0, 1, 2, 3>(mul_zxxy, mul_yz); + ssef vdir0 = vdir & cast(ssei(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0)); - __m128 htfm0 = shuffle<0, 2, 0, 3>(mul_shuf, vdir0); - __m128 htfm1 = shuffle<1, 0, 1, 3>(_mm_set_ss(_mm_cvtss_f32(d_ss)), vdir0); - __m128 htfm2 = shuffle<1, 3, 2, 3>(mul_shuf, vdir0); + ssef 
htfm0 = shuffle<0, 2, 0, 3>(mul_shuf, vdir0); + ssef htfm1 = shuffle<1, 0, 1, 3>(load1f_first(extract<0>(d_ss)), vdir0); + ssef htfm2 = shuffle<1, 3, 2, 3>(mul_shuf, vdir0); - __m128 htfm[] = { htfm0, htfm1, htfm2 }; - __m128 vP = load_m128(P); - __m128 p0 = transform_point_T3(htfm, _mm_sub_ps(P_curve[0], vP)); - __m128 p1 = transform_point_T3(htfm, _mm_sub_ps(P_curve[1], vP)); - __m128 p2 = transform_point_T3(htfm, _mm_sub_ps(P_curve[2], vP)); - __m128 p3 = transform_point_T3(htfm, _mm_sub_ps(P_curve[3], vP)); + ssef htfm[] = { htfm0, htfm1, htfm2 }; + ssef vP = load4f(P); + ssef p0 = transform_point_T3(htfm, P_curve[0] - vP); + ssef p1 = transform_point_T3(htfm, P_curve[1] - vP); + ssef p2 = transform_point_T3(htfm, P_curve[2] - vP); + ssef p3 = transform_point_T3(htfm, P_curve[3] - vP); float fc = 0.71f; - __m128 vfc = _mm_set1_ps(fc); - __m128 vfcxp3 = _mm_mul_ps(vfc, p3); + ssef vfc = ssef(fc); + ssef vfcxp3 = vfc * p3; vcurve_coef[0] = p1; - vcurve_coef[1] = _mm_mul_ps(vfc, _mm_sub_ps(p2, p0)); - vcurve_coef[2] = fma(_mm_set1_ps(fc * 2.0f), p0, fma(_mm_set1_ps(fc - 3.0f), p1, fms(_mm_set1_ps(3.0f - 2.0f * fc), p2, vfcxp3))); - vcurve_coef[3] = fms(_mm_set1_ps(fc - 2.0f), _mm_sub_ps(p2, p1), fms(vfc, p0, vfcxp3)); + vcurve_coef[1] = vfc * (p2 - p0); + vcurve_coef[2] = madd(ssef(fc * 2.0f), p0, madd(ssef(fc - 3.0f), p1, msub(ssef(3.0f - 2.0f * fc), p2, vfcxp3))); + vcurve_coef[3] = msub(ssef(fc - 2.0f), p2 - p1, msub(vfc, p0, vfcxp3)); r_st = ((float4 &)P_curve[1]).w; r_en = ((float4 &)P_curve[2]).w; @@ -386,12 +386,12 @@ ccl_device_inline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersect float i_st = tree * resol; float i_en = i_st + (level * resol); #ifdef __KERNEL_SSE2__ - __m128 vi_st = _mm_set1_ps(i_st), vi_en = _mm_set1_ps(i_en); - __m128 vp_st = fma(fma(fma(vcurve_coef[3], vi_st, vcurve_coef[2]), vi_st, vcurve_coef[1]), vi_st, vcurve_coef[0]); - __m128 vp_en = fma(fma(fma(vcurve_coef[3], vi_en, vcurve_coef[2]), vi_en, vcurve_coef[1]), vi_en, 
vcurve_coef[0]); + ssef vi_st = ssef(i_st), vi_en = ssef(i_en); + ssef vp_st = madd(madd(madd(vcurve_coef[3], vi_st, vcurve_coef[2]), vi_st, vcurve_coef[1]), vi_st, vcurve_coef[0]); + ssef vp_en = madd(madd(madd(vcurve_coef[3], vi_en, vcurve_coef[2]), vi_en, vcurve_coef[1]), vi_en, vcurve_coef[0]); - __m128 vbmin = _mm_min_ps(vp_st, vp_en); - __m128 vbmax = _mm_max_ps(vp_st, vp_en); + ssef vbmin = min(vp_st, vp_en); + ssef vbmax = max(vp_st, vp_en); float3 &bmin = (float3 &)vbmin, &bmax = (float3 &)vbmax; float &bminx = bmin.x, &bminy = bmin.y, &bminz = bmin.z; @@ -600,13 +600,12 @@ ccl_device_inline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersect #endif { /* record intersection */ + isect->t = t; + isect->u = u; + isect->v = gd; isect->prim = curveAddr; isect->object = object; isect->type = type; - isect->u = u; - isect->v = gd; - /*isect->transparency = 1.0f - coverage; */ - isect->t = t; hit = true; } @@ -679,38 +678,38 @@ ccl_device_inline bool bvh_curve_intersect(KernelGlobals *kg, Intersection *isec float sphere_b_tmp = dot3(dir, sphere_dif1); float3 sphere_dif2 = sphere_dif1 - sphere_b_tmp * dir; #else - __m128 P_curve[2]; + ssef P_curve[2]; if(type & PRIMITIVE_CURVE) { - P_curve[0] = _mm_load_ps(&kg->__curve_keys.data[k0].x); - P_curve[1] = _mm_load_ps(&kg->__curve_keys.data[k1].x); + P_curve[0] = load4f(&kg->__curve_keys.data[k0].x); + P_curve[1] = load4f(&kg->__curve_keys.data[k1].x); } else { int fobject = (object == OBJECT_NONE)? 
kernel_tex_fetch(__prim_object, curveAddr): object; motion_curve_keys(kg, fobject, prim, time, k0, k1, (float4*)&P_curve); } - const __m128 or12 = shuffle<3, 3, 3, 3>(P_curve[0], P_curve[1]); + const ssef or12 = shuffle<3, 3, 3, 3>(P_curve[0], P_curve[1]); - __m128 r12 = or12; - const __m128 vP = load_m128(P); - const __m128 dif = _mm_sub_ps(vP, P_curve[0]); - const __m128 dif_second = _mm_sub_ps(vP, P_curve[1]); + ssef r12 = or12; + const ssef vP = load4f(P); + const ssef dif = vP - P_curve[0]; + const ssef dif_second = vP - P_curve[1]; if(difl != 0.0f) { - const __m128 len1_sq = len3_squared_splat(dif); - const __m128 len2_sq = len3_squared_splat(dif_second); - const __m128 len12 = _mm_sqrt_ps(shuffle<0, 0, 0, 0>(len1_sq, len2_sq)); - const __m128 pixelsize12 = _mm_min_ps(_mm_mul_ps(len12, _mm_set1_ps(difl)), _mm_set1_ps(extmax)); - r12 = _mm_max_ps(or12, pixelsize12); + const ssef len1_sq = len3_squared_splat(dif); + const ssef len2_sq = len3_squared_splat(dif_second); + const ssef len12 = mm_sqrt(shuffle<0, 0, 0, 0>(len1_sq, len2_sq)); + const ssef pixelsize12 = min(len12 * difl, ssef(extmax)); + r12 = max(or12, pixelsize12); } - float or1 = _mm_cvtss_f32(or12), or2 = _mm_cvtss_f32(broadcast<2>(or12)); - float r1 = _mm_cvtss_f32(r12), r2 = _mm_cvtss_f32(broadcast<2>(r12)); - - const __m128 p21_diff = _mm_sub_ps(P_curve[1], P_curve[0]); - const __m128 sphere_dif1 = _mm_mul_ps(_mm_add_ps(dif, dif_second), _mm_set1_ps(0.5f)); - const __m128 dir = load_m128(direction); - const __m128 sphere_b_tmp = dot3_splat(dir, sphere_dif1); - const __m128 sphere_dif2 = fnma(sphere_b_tmp, dir, sphere_dif1); + float or1 = extract<0>(or12), or2 = extract<0>(shuffle<2>(or12)); + float r1 = extract<0>(r12), r2 = extract<0>(shuffle<2>(r12)); + + const ssef p21_diff = P_curve[1] - P_curve[0]; + const ssef sphere_dif1 = (dif + dif_second) * 0.5f; + const ssef dir = load4f(direction); + const ssef sphere_b_tmp = dot3_splat(dir, sphere_dif1); + const ssef sphere_dif2 = 
nmsub(sphere_b_tmp, dir, sphere_dif1); #endif float mr = max(r1, r2); @@ -728,7 +727,7 @@ ccl_device_inline bool bvh_curve_intersect(KernelGlobals *kg, Intersection *isec #ifndef __KERNEL_SSE2__ float3 tg = p21_diff * invl; #else - const __m128 tg = _mm_mul_ps(p21_diff, _mm_set1_ps(invl)); + const ssef tg = p21_diff * invl; #endif float gd = (r2 - r1) * invl; @@ -752,7 +751,7 @@ ccl_device_inline bool bvh_curve_intersect(KernelGlobals *kg, Intersection *isec float3 cprod = cross(tg, dir); float cprod2sq = len3_squared(cross(tg, dif)); #else - const __m128 cprod = cross(tg, dir); + const ssef cprod = cross(tg, dir); float cprod2sq = len3_squared(cross_zxy(tg, dif)); #endif float cprodsq = len3_squared(cprod); @@ -770,7 +769,7 @@ ccl_device_inline bool bvh_curve_intersect(KernelGlobals *kg, Intersection *isec #ifndef __KERNEL_SSE2__ float3 tdif = dif + tcentre * dir; #else - const __m128 tdif = fma(_mm_set1_ps(tcentre), dir, dif); + const ssef tdif = madd(ssef(tcentre), dir, dif); #endif float tdifz = dot3(tdif, tg); float tdifma = tdifz*gd + r1; @@ -836,13 +835,12 @@ ccl_device_inline bool bvh_curve_intersect(KernelGlobals *kg, Intersection *isec #endif { /* record intersection */ + isect->t = t; + isect->u = z*invl; + isect->v = gd; isect->prim = curveAddr; isect->object = object; isect->type = type; - isect->u = z*invl; - isect->v = gd; - /*isect->transparency = 1.0f - adjradius;*/ - isect->t = t; return true; } @@ -938,9 +936,10 @@ ccl_device_inline float3 bvh_curve_refine(KernelGlobals *kg, ShaderData *sd, con sd->u = isect->u; sd->v = 0.0f; #endif - + + tg = normalize(curvetangent(isect->u, p[0], p[1], p[2], p[3])); + if(kernel_data.curve.curveflags & CURVE_KN_RIBBONS) { - tg = normalize(curvetangent(isect->u, p[0], p[1], p[2], p[3])); sd->Ng = normalize(-(D - tg * (dot(tg, D)))); } else { @@ -952,7 +951,6 @@ ccl_device_inline float3 bvh_curve_refine(KernelGlobals *kg, ShaderData *sd, con float gd = isect->v; if(gd != 0.0f) { - tg = 
normalize(curvetangent(isect->u, p[0], p[1], p[2], p[3])); sd->Ng = sd->Ng - gd * tg; sd->Ng = normalize(sd->Ng); } @@ -1012,10 +1010,6 @@ ccl_device_inline float3 bvh_curve_refine(KernelGlobals *kg, ShaderData *sd, con sd->dPdv = cross(tg, sd->Ng); #endif - /*add fading parameter for minimum pixel width with transparency bsdf*/ - /*sd->curve_transparency = isect->transparency;*/ - /*sd->curve_radius = sd->u * gd * l + r1;*/ - if(isect->object != OBJECT_NONE) { #ifdef __OBJECT_MOTION__ Transform tfm = sd->ob_tfm; diff --git a/intern/cycles/kernel/geom/geom_motion_triangle.h b/intern/cycles/kernel/geom/geom_motion_triangle.h index 73338bb6b3b..3a4b20e61aa 100644 --- a/intern/cycles/kernel/geom/geom_motion_triangle.h +++ b/intern/cycles/kernel/geom/geom_motion_triangle.h @@ -233,8 +233,7 @@ ccl_device_inline float3 motion_triangle_refine_subsurface(KernelGlobals *kg, Sh ccl_device_noinline void motion_triangle_shader_setup(KernelGlobals *kg, ShaderData *sd, const Intersection *isect, const Ray *ray, bool subsurface) { /* get shader */ - float4 Ns = kernel_tex_fetch(__tri_normal, sd->prim); - sd->shader = __float_as_int(Ns.w); + sd->shader = kernel_tex_fetch(__tri_shader, sd->prim); /* get motion info */ int numsteps, numverts; @@ -273,7 +272,11 @@ ccl_device_noinline void motion_triangle_shader_setup(KernelGlobals *kg, ShaderD #endif /* compute face normal */ - float3 Ng = normalize(cross(verts[1] - verts[0], verts[2] - verts[0])); + float3 Ng; + if(sd->flag & SD_NEGATIVE_SCALE_APPLIED) + Ng = normalize(cross(verts[2] - verts[0], verts[1] - verts[0])); + else + Ng = normalize(cross(verts[1] - verts[0], verts[2] - verts[0])); sd->Ng = Ng; sd->N = Ng; @@ -327,14 +330,21 @@ ccl_device_inline bool motion_triangle_intersect(KernelGlobals *kg, Intersection float t, u, v; if(ray_triangle_intersect_uv(P, dir, isect->t, verts[2], verts[0], verts[1], &u, &v, &t)) { - isect->prim = triAddr; - isect->object = object; - isect->type = PRIMITIVE_MOTION_TRIANGLE; - isect->u = u; - 
isect->v = v; - isect->t = t; +#ifdef __VISIBILITY_FLAG__ + /* visibility flag test. we do it here under the assumption + * that most triangles are culled by node flags */ + if(kernel_tex_fetch(__prim_visibility, triAddr) & visibility) +#endif + { + isect->t = t; + isect->u = u; + isect->v = v; + isect->prim = triAddr; + isect->object = object; + isect->type = PRIMITIVE_MOTION_TRIANGLE; - return true; + return true; + } } return false; @@ -378,12 +388,12 @@ ccl_device_inline void motion_triangle_intersect_subsurface(KernelGlobals *kg, I /* record intersection */ Intersection *isect = &isect_array[hit]; + isect->t = t; + isect->u = u; + isect->v = v; isect->prim = triAddr; isect->object = object; isect->type = PRIMITIVE_MOTION_TRIANGLE; - isect->u = u; - isect->v = v; - isect->t = t; } } #endif diff --git a/intern/cycles/kernel/geom/geom_primitive.h b/intern/cycles/kernel/geom/geom_primitive.h index 533973621d7..5df6c75df86 100644 --- a/intern/cycles/kernel/geom/geom_primitive.h +++ b/intern/cycles/kernel/geom/geom_primitive.h @@ -143,6 +143,7 @@ ccl_device float4 primitive_motion_vector(KernelGlobals *kg, ShaderData *sd) /* center position */ float3 center; +#ifdef __HAIR__ if(sd->type & PRIMITIVE_ALL_CURVE) { center = curve_motion_center_location(kg, sd); @@ -150,6 +151,7 @@ ccl_device float4 primitive_motion_vector(KernelGlobals *kg, ShaderData *sd) object_position_transform(kg, sd, ¢er); } else +#endif center = sd->P; float3 motion_pre = center, motion_post = center; diff --git a/intern/cycles/kernel/geom/geom_triangle.h b/intern/cycles/kernel/geom/geom_triangle.h index 355e36fef0c..c08a82ee038 100644 --- a/intern/cycles/kernel/geom/geom_triangle.h +++ b/intern/cycles/kernel/geom/geom_triangle.h @@ -18,7 +18,7 @@ /* Triangle Primitive * * Basic triangle with 3 vertices is used to represent mesh surfaces. 
For BVH - * ray intersection we use a precomputed triangle storage to accelarate + * ray intersection we use a precomputed triangle storage to accelerate * intersection at the cost of more memory usage */ CCL_NAMESPACE_BEGIN @@ -116,11 +116,28 @@ ccl_device_inline float3 triangle_refine_subsurface(KernelGlobals *kg, ShaderDat #endif } +/* normal on triangle */ +ccl_device_inline float3 triangle_normal(KernelGlobals *kg, ShaderData *sd) +{ + /* load triangle vertices */ + float4 tri_vindex = kernel_tex_fetch(__tri_vindex, sd->prim); + + float3 v0 = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.x))); + float3 v1 = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.y))); + float3 v2 = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.z))); + + /* return normal */ + if(sd->flag & SD_NEGATIVE_SCALE_APPLIED) + return normalize(cross(v2 - v0, v1 - v0)); + else + return normalize(cross(v1 - v0, v2 - v0)); +} + /* point and normal on triangle */ -ccl_device_inline void triangle_point_normal(KernelGlobals *kg, int prim, float u, float v, float3 *P, float3 *Ng, int *shader) +ccl_device_inline void triangle_point_normal(KernelGlobals *kg, int object, int prim, float u, float v, float3 *P, float3 *Ng, int *shader) { /* load triangle vertices */ - float3 tri_vindex = float4_to_float3(kernel_tex_fetch(__tri_vindex, prim)); + float4 tri_vindex = kernel_tex_fetch(__tri_vindex, prim); float3 v0 = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.x))); float3 v1 = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.y))); @@ -130,16 +147,24 @@ ccl_device_inline void triangle_point_normal(KernelGlobals *kg, int prim, float float t = 1.0f - u - v; *P = (u*v0 + v*v1 + t*v2); - float4 Nm = kernel_tex_fetch(__tri_normal, prim); - *Ng = make_float3(Nm.x, Nm.y, Nm.z); - *shader = __float_as_int(Nm.w); + /* get object flags, instance-aware */ + int object_flag = 
kernel_tex_fetch(__object_flag, object >= 0 ? object : ~object); + + /* compute normal */ + if(object_flag & SD_NEGATIVE_SCALE_APPLIED) + *Ng = normalize(cross(v2 - v0, v1 - v0)); + else + *Ng = normalize(cross(v1 - v0, v2 - v0)); + + /* shader`*/ + *shader = kernel_tex_fetch(__tri_shader, prim); } /* Triangle vertex locations */ ccl_device_inline void triangle_vertices(KernelGlobals *kg, int prim, float3 P[3]) { - float3 tri_vindex = float4_to_float3(kernel_tex_fetch(__tri_vindex, prim)); + float4 tri_vindex = kernel_tex_fetch(__tri_vindex, prim); P[0] = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.x))); P[1] = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.y))); @@ -151,7 +176,7 @@ ccl_device_inline void triangle_vertices(KernelGlobals *kg, int prim, float3 P[3 ccl_device_inline float3 triangle_smooth_normal(KernelGlobals *kg, int prim, float u, float v) { /* load triangle vertices */ - float3 tri_vindex = float4_to_float3(kernel_tex_fetch(__tri_vindex, prim)); + float4 tri_vindex = kernel_tex_fetch(__tri_vindex, prim); float3 n0 = float4_to_float3(kernel_tex_fetch(__tri_vnormal, __float_as_int(tri_vindex.x))); float3 n1 = float4_to_float3(kernel_tex_fetch(__tri_vnormal, __float_as_int(tri_vindex.y))); @@ -165,7 +190,7 @@ ccl_device_inline float3 triangle_smooth_normal(KernelGlobals *kg, int prim, flo ccl_device_inline void triangle_dPdudv(KernelGlobals *kg, int prim, float3 *dPdu, float3 *dPdv) { /* fetch triangle vertex coordinates */ - float3 tri_vindex = float4_to_float3(kernel_tex_fetch(__tri_vindex, prim)); + float4 tri_vindex = kernel_tex_fetch(__tri_vindex, prim); float3 p0 = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.x))); float3 p1 = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.y))); @@ -187,7 +212,7 @@ ccl_device float triangle_attribute_float(KernelGlobals *kg, const ShaderData *s return kernel_tex_fetch(__attributes_float, offset + 
sd->prim); } else if(elem == ATTR_ELEMENT_VERTEX || elem == ATTR_ELEMENT_VERTEX_MOTION) { - float3 tri_vindex = float4_to_float3(kernel_tex_fetch(__tri_vindex, sd->prim)); + float4 tri_vindex = kernel_tex_fetch(__tri_vindex, sd->prim); float f0 = kernel_tex_fetch(__attributes_float, offset + __float_as_int(tri_vindex.x)); float f1 = kernel_tex_fetch(__attributes_float, offset + __float_as_int(tri_vindex.y)); @@ -230,7 +255,7 @@ ccl_device float3 triangle_attribute_float3(KernelGlobals *kg, const ShaderData return float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + sd->prim)); } else if(elem == ATTR_ELEMENT_VERTEX || elem == ATTR_ELEMENT_VERTEX_MOTION) { - float3 tri_vindex = float4_to_float3(kernel_tex_fetch(__tri_vindex, sd->prim)); + float4 tri_vindex = kernel_tex_fetch(__tri_vindex, sd->prim); float3 f0 = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + __float_as_int(tri_vindex.x))); float3 f1 = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + __float_as_int(tri_vindex.y))); @@ -243,11 +268,20 @@ ccl_device float3 triangle_attribute_float3(KernelGlobals *kg, const ShaderData return sd->u*f0 + sd->v*f1 + (1.0f - sd->u - sd->v)*f2; } - else if(elem == ATTR_ELEMENT_CORNER) { + else if(elem == ATTR_ELEMENT_CORNER || elem == ATTR_ELEMENT_CORNER_BYTE) { int tri = offset + sd->prim*3; - float3 f0 = float4_to_float3(kernel_tex_fetch(__attributes_float3, tri + 0)); - float3 f1 = float4_to_float3(kernel_tex_fetch(__attributes_float3, tri + 1)); - float3 f2 = float4_to_float3(kernel_tex_fetch(__attributes_float3, tri + 2)); + float3 f0, f1, f2; + + if(elem == ATTR_ELEMENT_CORNER) { + f0 = float4_to_float3(kernel_tex_fetch(__attributes_float3, tri + 0)); + f1 = float4_to_float3(kernel_tex_fetch(__attributes_float3, tri + 1)); + f2 = float4_to_float3(kernel_tex_fetch(__attributes_float3, tri + 2)); + } + else { + f0 = color_byte_to_float(kernel_tex_fetch(__attributes_uchar4, tri + 0)); + f1 = 
color_byte_to_float(kernel_tex_fetch(__attributes_uchar4, tri + 1)); + f2 = color_byte_to_float(kernel_tex_fetch(__attributes_uchar4, tri + 2)); + } #ifdef __RAY_DIFFERENTIALS__ if(dx) *dx = sd->du.dx*f0 + sd->dv.dx*f1 - (sd->du.dx + sd->dv.dx)*f2; @@ -300,12 +334,12 @@ ccl_device_inline bool triangle_intersect(KernelGlobals *kg, Intersection *isect #endif { /* record intersection */ + isect->t = t; + isect->u = u; + isect->v = v; isect->prim = triAddr; isect->object = object; isect->type = PRIMITIVE_TRIANGLE; - isect->u = u; - isect->v = v; - isect->t = t; return true; } } @@ -363,12 +397,12 @@ ccl_device_inline void triangle_intersect_subsurface(KernelGlobals *kg, Intersec /* record intersection */ Intersection *isect = &isect_array[hit]; + isect->t = t; + isect->u = u; + isect->v = v; isect->prim = triAddr; isect->object = object; isect->type = PRIMITIVE_TRIANGLE; - isect->u = u; - isect->v = v; - isect->t = t; } } } diff --git a/intern/cycles/kernel/geom/geom_volume.h b/intern/cycles/kernel/geom/geom_volume.h index 963d6cbee9c..3cb6d168f80 100644 --- a/intern/cycles/kernel/geom/geom_volume.h +++ b/intern/cycles/kernel/geom/geom_volume.h @@ -49,7 +49,15 @@ ccl_device float3 volume_normalized_position(KernelGlobals *kg, const ShaderData ccl_device float volume_attribute_float(KernelGlobals *kg, const ShaderData *sd, AttributeElement elem, int id, float *dx, float *dy) { float3 P = volume_normalized_position(kg, sd, sd->P); - float4 r = kernel_tex_image_interp_3d(id, P.x, P.y, P.z); +#ifdef __KERNEL_GPU__ + float4 r = make_float4(0.0f, 0.0f, 0.0f, 0.0f); +#else + float4 r; + if(sd->flag & SD_VOLUME_CUBIC) + r = kernel_tex_image_interp_3d_ex(id, P.x, P.y, P.z, INTERPOLATION_CUBIC); + else + r = kernel_tex_image_interp_3d(id, P.x, P.y, P.z); +#endif if(dx) *dx = 0.0f; if(dx) *dy = 0.0f; @@ -61,7 +69,15 @@ ccl_device float volume_attribute_float(KernelGlobals *kg, const ShaderData *sd, ccl_device float3 volume_attribute_float3(KernelGlobals *kg, const ShaderData *sd, 
AttributeElement elem, int id, float3 *dx, float3 *dy) { float3 P = volume_normalized_position(kg, sd, sd->P); - float4 r = kernel_tex_image_interp_3d(id, P.x, P.y, P.z); +#ifdef __KERNEL_GPU__ + float4 r = make_float4(0.0f, 0.0f, 0.0f, 0.0f); +#else + float4 r; + if(sd->flag & SD_VOLUME_CUBIC) + r = kernel_tex_image_interp_3d_ex(id, P.x, P.y, P.z, INTERPOLATION_CUBIC); + else + r = kernel_tex_image_interp_3d(id, P.x, P.y, P.z); +#endif if(dx) *dx = make_float3(0.0f, 0.0f, 0.0f); if(dy) *dy = make_float3(0.0f, 0.0f, 0.0f); diff --git a/intern/cycles/kernel/kernel.cl b/intern/cycles/kernel/kernel.cl index 6988ad6027f..4f20ef9ca15 100644 --- a/intern/cycles/kernel/kernel.cl +++ b/intern/cycles/kernel/kernel.cl @@ -23,7 +23,7 @@ #include "kernel_film.h" #include "kernel_path.h" -#include "kernel_displace.h" +#include "kernel_bake.h" __kernel void kernel_ocl_path_trace( ccl_constant KernelData *data, @@ -115,7 +115,7 @@ __kernel void kernel_ocl_shader( ccl_global type *name, #include "kernel_textures.h" - int type, int sx, int sw) + int type, int sx, int sw, int offset, int sample) { KernelGlobals kglobals, *kg = &kglobals; @@ -128,6 +128,31 @@ __kernel void kernel_ocl_shader( int x = sx + get_global_id(0); if(x < sx + sw) - kernel_shader_evaluate(kg, input, output, (ShaderEvalType)type, x); + kernel_shader_evaluate(kg, input, output, (ShaderEvalType)type, x, sample); +} + +__kernel void kernel_ocl_bake( + ccl_constant KernelData *data, + ccl_global uint4 *input, + ccl_global float4 *output, + +#define KERNEL_TEX(type, ttype, name) \ + ccl_global type *name, +#include "kernel_textures.h" + + int type, int sx, int sw, int offset, int sample) +{ + KernelGlobals kglobals, *kg = &kglobals; + + kg->data = data; + +#define KERNEL_TEX(type, ttype, name) \ + kg->name = name; +#include "kernel_textures.h" + + int x = sx + get_global_id(0); + + if(x < sx + sw) + kernel_bake_evaluate(kg, input, output, (ShaderEvalType)type, x, offset, sample); } diff --git 
a/intern/cycles/kernel/kernel.cpp b/intern/cycles/kernel/kernel.cpp index 173028d50c8..fa2113fbb46 100644 --- a/intern/cycles/kernel/kernel.cpp +++ b/intern/cycles/kernel/kernel.cpp @@ -23,7 +23,7 @@ #include "kernel_globals.h" #include "kernel_film.h" #include "kernel_path.h" -#include "kernel_displace.h" +#include "kernel_bake.h" CCL_NAMESPACE_BEGIN @@ -120,9 +120,12 @@ void kernel_cpu_convert_to_half_float(KernelGlobals *kg, uchar4 *rgba, float *bu /* Shader Evaluation */ -void kernel_cpu_shader(KernelGlobals *kg, uint4 *input, float4 *output, int type, int i) +void kernel_cpu_shader(KernelGlobals *kg, uint4 *input, float4 *output, int type, int i, int offset, int sample) { - kernel_shader_evaluate(kg, input, output, (ShaderEvalType)type, i); + if(type >= SHADER_EVAL_BAKE) + kernel_bake_evaluate(kg, input, output, (ShaderEvalType)type, i, offset, sample); + else + kernel_shader_evaluate(kg, input, output, (ShaderEvalType)type, i, sample); } CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernel.cu b/intern/cycles/kernel/kernel.cu index 636e48b5456..489daacddde 100644 --- a/intern/cycles/kernel/kernel.cu +++ b/intern/cycles/kernel/kernel.cu @@ -22,7 +22,7 @@ #include "kernel_globals.h" #include "kernel_film.h" #include "kernel_path.h" -#include "kernel_displace.h" +#include "kernel_bake.h" /* device data taken from CUDA occupancy calculator */ @@ -52,8 +52,20 @@ #define CUDA_KERNEL_MAX_REGISTERS 63 #define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63 -/* 5.0 */ -#elif __CUDA_ARCH__ == 500 +/* 3.2 */ +#elif __CUDA_ARCH__ == 320 +#define CUDA_MULTIPRESSOR_MAX_REGISTERS 32768 +#define CUDA_MULTIPROCESSOR_MAX_BLOCKS 16 +#define CUDA_BLOCK_MAX_THREADS 1024 +#define CUDA_THREAD_MAX_REGISTERS 63 + +/* tunable parameters */ +#define CUDA_THREADS_BLOCK_WIDTH 16 +#define CUDA_KERNEL_MAX_REGISTERS 63 +#define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63 + +/* 5.0 and 5.2 */ +#elif __CUDA_ARCH__ == 500 || __CUDA_ARCH__ == 520 #define CUDA_MULTIPRESSOR_MAX_REGISTERS 65536 #define 
CUDA_MULTIPROCESSOR_MAX_BLOCKS 32 #define CUDA_BLOCK_MAX_THREADS 1024 @@ -61,12 +73,12 @@ /* tunable parameters */ #define CUDA_THREADS_BLOCK_WIDTH 16 -#define CUDA_KERNEL_MAX_REGISTERS 63 +#define CUDA_KERNEL_MAX_REGISTERS 40 #define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63 /* unknown architecture */ #else -#error "Unknown or unuspported CUDA architecture, can't determine launch bounds" +#error "Unknown or unsupported CUDA architecture, can't determine launch bounds" #endif /* compute number of threads per block and minimum blocks per multiprocessor @@ -146,11 +158,22 @@ kernel_cuda_convert_to_half_float(uchar4 *rgba, float *buffer, float sample_scal extern "C" __global__ void CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) -kernel_cuda_shader(uint4 *input, float4 *output, int type, int sx) +kernel_cuda_shader(uint4 *input, float4 *output, int type, int sx, int sw, int offset, int sample) +{ + int x = sx + blockDim.x*blockIdx.x + threadIdx.x; + + if(x < sx + sw) + kernel_shader_evaluate(NULL, input, output, (ShaderEvalType)type, x, sample); +} + +extern "C" __global__ void +CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) +kernel_cuda_bake(uint4 *input, float4 *output, int type, int sx, int sw, int offset, int sample) { int x = sx + blockDim.x*blockIdx.x + threadIdx.x; - kernel_shader_evaluate(NULL, input, output, (ShaderEvalType)type, x); + if(x < sx + sw) + kernel_bake_evaluate(NULL, input, output, (ShaderEvalType)type, x, offset, sample); } #endif diff --git a/intern/cycles/kernel/kernel.h b/intern/cycles/kernel/kernel.h index c4a08646bab..19e06b88797 100644 --- a/intern/cycles/kernel/kernel.h +++ b/intern/cycles/kernel/kernel.h @@ -41,7 +41,7 @@ void kernel_cpu_convert_to_byte(KernelGlobals *kg, uchar4 *rgba, float *buffer, void kernel_cpu_convert_to_half_float(KernelGlobals *kg, uchar4 *rgba, float *buffer, float sample_scale, int x, int y, int offset, int stride); void kernel_cpu_shader(KernelGlobals *kg, uint4 
*input, float4 *output, - int type, int i); + int type, int i, int offset, int sample); #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 void kernel_cpu_sse2_path_trace(KernelGlobals *kg, float *buffer, unsigned int *rng_state, @@ -51,7 +51,7 @@ void kernel_cpu_sse2_convert_to_byte(KernelGlobals *kg, uchar4 *rgba, float *buf void kernel_cpu_sse2_convert_to_half_float(KernelGlobals *kg, uchar4 *rgba, float *buffer, float sample_scale, int x, int y, int offset, int stride); void kernel_cpu_sse2_shader(KernelGlobals *kg, uint4 *input, float4 *output, - int type, int i); + int type, int i, int offset, int sample); #endif #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 @@ -62,7 +62,7 @@ void kernel_cpu_sse3_convert_to_byte(KernelGlobals *kg, uchar4 *rgba, float *buf void kernel_cpu_sse3_convert_to_half_float(KernelGlobals *kg, uchar4 *rgba, float *buffer, float sample_scale, int x, int y, int offset, int stride); void kernel_cpu_sse3_shader(KernelGlobals *kg, uint4 *input, float4 *output, - int type, int i); + int type, int i, int offset, int sample); #endif #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 @@ -73,7 +73,7 @@ void kernel_cpu_sse41_convert_to_byte(KernelGlobals *kg, uchar4 *rgba, float *bu void kernel_cpu_sse41_convert_to_half_float(KernelGlobals *kg, uchar4 *rgba, float *buffer, float sample_scale, int x, int y, int offset, int stride); void kernel_cpu_sse41_shader(KernelGlobals *kg, uint4 *input, float4 *output, - int type, int i); + int type, int i, int offset, int sample); #endif #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX @@ -84,7 +84,18 @@ void kernel_cpu_avx_convert_to_byte(KernelGlobals *kg, uchar4 *rgba, float *buff void kernel_cpu_avx_convert_to_half_float(KernelGlobals *kg, uchar4 *rgba, float *buffer, float sample_scale, int x, int y, int offset, int stride); void kernel_cpu_avx_shader(KernelGlobals *kg, uint4 *input, float4 *output, - int type, int i); + int type, int i, int offset, int sample); +#endif + +#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 +void 
kernel_cpu_avx2_path_trace(KernelGlobals *kg, float *buffer, unsigned int *rng_state, + int sample, int x, int y, int offset, int stride); +void kernel_cpu_avx2_convert_to_byte(KernelGlobals *kg, uchar4 *rgba, float *buffer, + float sample_scale, int x, int y, int offset, int stride); +void kernel_cpu_avx2_convert_to_half_float(KernelGlobals *kg, uchar4 *rgba, float *buffer, + float sample_scale, int x, int y, int offset, int stride); +void kernel_cpu_avx2_shader(KernelGlobals *kg, uint4 *input, float4 *output, + int type, int i, int offset, int sample); #endif CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernel_accumulate.h b/intern/cycles/kernel/kernel_accumulate.h index b4f6dcdace9..b0efcdc66a7 100644 --- a/intern/cycles/kernel/kernel_accumulate.h +++ b/intern/cycles/kernel/kernel_accumulate.h @@ -32,10 +32,11 @@ ccl_device_inline void bsdf_eval_init(BsdfEval *eval, ClosureType type, float3 v eval->transmission = make_float3(0.0f, 0.0f, 0.0f); eval->transparent = make_float3(0.0f, 0.0f, 0.0f); eval->subsurface = make_float3(0.0f, 0.0f, 0.0f); + eval->scatter = make_float3(0.0f, 0.0f, 0.0f); if(type == CLOSURE_BSDF_TRANSPARENT_ID) eval->transparent = value; - else if(CLOSURE_IS_BSDF_DIFFUSE(type) || CLOSURE_IS_PHASE(type)) + else if(CLOSURE_IS_BSDF_DIFFUSE(type)) eval->diffuse = value; else if(CLOSURE_IS_BSDF_GLOSSY(type)) eval->glossy = value; @@ -43,6 +44,8 @@ ccl_device_inline void bsdf_eval_init(BsdfEval *eval, ClosureType type, float3 v eval->transmission = value; else if(CLOSURE_IS_BSDF_BSSRDF(type)) eval->subsurface = value; + else if(CLOSURE_IS_PHASE(type)) + eval->scatter = value; } else eval->diffuse = value; @@ -51,11 +54,17 @@ ccl_device_inline void bsdf_eval_init(BsdfEval *eval, ClosureType type, float3 v #endif } -ccl_device_inline void bsdf_eval_accum(BsdfEval *eval, ClosureType type, float3 value) +/* TODO(sergey): This is just a workaround for annoying 6.5 compiler bug. 
*/ +#if !defined(__KERNEL_CUDA__) || __CUDA_ARCH__ < 500 +ccl_device_inline +#else +ccl_device_noinline +#endif +void bsdf_eval_accum(BsdfEval *eval, ClosureType type, float3 value) { #ifdef __PASSES__ if(eval->use_light_pass) { - if(CLOSURE_IS_BSDF_DIFFUSE(type) || CLOSURE_IS_PHASE(type)) + if(CLOSURE_IS_BSDF_DIFFUSE(type)) eval->diffuse += value; else if(CLOSURE_IS_BSDF_GLOSSY(type)) eval->glossy += value; @@ -63,6 +72,8 @@ ccl_device_inline void bsdf_eval_accum(BsdfEval *eval, ClosureType type, float3 eval->transmission += value; else if(CLOSURE_IS_BSDF_BSSRDF(type)) eval->subsurface += value; + else if(CLOSURE_IS_PHASE(type)) + eval->scatter += value; /* skipping transparent, this function is used by for eval(), will be zero then */ } @@ -81,7 +92,8 @@ ccl_device_inline bool bsdf_eval_is_zero(BsdfEval *eval) && is_zero(eval->glossy) && is_zero(eval->transmission) && is_zero(eval->transparent) - && is_zero(eval->subsurface); + && is_zero(eval->subsurface) + && is_zero(eval->scatter); } else return is_zero(eval->diffuse); @@ -98,6 +110,7 @@ ccl_device_inline void bsdf_eval_mul(BsdfEval *eval, float3 value) eval->glossy *= value; eval->transmission *= value; eval->subsurface *= value; + eval->scatter *= value; /* skipping transparent, this function is used by for eval(), will be zero then */ } @@ -111,7 +124,7 @@ ccl_device_inline void bsdf_eval_mul(BsdfEval *eval, float3 value) /* Path Radiance * * We accumulate different render passes separately. After summing at the end - * to get the combined result, it should be identical. We definte directly + * to get the combined result, it should be identical. We definite directly * visible as the first non-transparent hit, while indirectly visible are the * bounces after that. 
*/ @@ -130,21 +143,25 @@ ccl_device_inline void path_radiance_init(PathRadiance *L, int use_light_pass) L->color_glossy = make_float3(0.0f, 0.0f, 0.0f); L->color_transmission = make_float3(0.0f, 0.0f, 0.0f); L->color_subsurface = make_float3(0.0f, 0.0f, 0.0f); + L->color_scatter = make_float3(0.0f, 0.0f, 0.0f); L->direct_diffuse = make_float3(0.0f, 0.0f, 0.0f); L->direct_glossy = make_float3(0.0f, 0.0f, 0.0f); L->direct_transmission = make_float3(0.0f, 0.0f, 0.0f); L->direct_subsurface = make_float3(0.0f, 0.0f, 0.0f); + L->direct_scatter = make_float3(0.0f, 0.0f, 0.0f); L->indirect_diffuse = make_float3(0.0f, 0.0f, 0.0f); L->indirect_glossy = make_float3(0.0f, 0.0f, 0.0f); L->indirect_transmission = make_float3(0.0f, 0.0f, 0.0f); L->indirect_subsurface = make_float3(0.0f, 0.0f, 0.0f); + L->indirect_scatter = make_float3(0.0f, 0.0f, 0.0f); L->path_diffuse = make_float3(0.0f, 0.0f, 0.0f); L->path_glossy = make_float3(0.0f, 0.0f, 0.0f); L->path_transmission = make_float3(0.0f, 0.0f, 0.0f); L->path_subsurface = make_float3(0.0f, 0.0f, 0.0f); + L->path_scatter = make_float3(0.0f, 0.0f, 0.0f); L->emission = make_float3(0.0f, 0.0f, 0.0f); L->background = make_float3(0.0f, 0.0f, 0.0f); @@ -174,14 +191,16 @@ ccl_device_inline void path_radiance_bsdf_bounce(PathRadiance *L, float3 *throug L->path_glossy = bsdf_eval->glossy*value; L->path_transmission = bsdf_eval->transmission*value; L->path_subsurface = bsdf_eval->subsurface*value; + L->path_scatter = bsdf_eval->scatter*value; - *throughput = L->path_diffuse + L->path_glossy + L->path_transmission + L->path_subsurface; + *throughput = L->path_diffuse + L->path_glossy + L->path_transmission + L->path_subsurface + L->path_scatter; L->direct_throughput = *throughput; } else { /* transparent bounce before first hit, or indirectly visible through BSDF */ - float3 sum = (bsdf_eval->diffuse + bsdf_eval->glossy + bsdf_eval->transmission + bsdf_eval->transparent + bsdf_eval->subsurface)*inverse_pdf; + float3 sum = (bsdf_eval->diffuse 
+ bsdf_eval->glossy + bsdf_eval->transmission + bsdf_eval->transparent + + bsdf_eval->subsurface + bsdf_eval->scatter) * inverse_pdf; *throughput *= sum; } } @@ -241,6 +260,7 @@ ccl_device_inline void path_radiance_accum_light(PathRadiance *L, float3 through L->direct_glossy += throughput*bsdf_eval->glossy*shadow; L->direct_transmission += throughput*bsdf_eval->transmission*shadow; L->direct_subsurface += throughput*bsdf_eval->subsurface*shadow; + L->direct_scatter += throughput*bsdf_eval->scatter*shadow; if(is_lamp) { L->shadow.x += shadow.x*shadow_fac; @@ -250,7 +270,7 @@ ccl_device_inline void path_radiance_accum_light(PathRadiance *L, float3 through } else { /* indirectly visible lighting after BSDF bounce */ - float3 sum = bsdf_eval->diffuse + bsdf_eval->glossy + bsdf_eval->transmission + bsdf_eval->subsurface; + float3 sum = bsdf_eval->diffuse + bsdf_eval->glossy + bsdf_eval->transmission + bsdf_eval->subsurface + bsdf_eval->scatter; L->indirect += throughput*sum*shadow; } } @@ -291,12 +311,14 @@ ccl_device_inline void path_radiance_sum_indirect(PathRadiance *L) L->direct_glossy += L->path_glossy*L->direct_emission; L->direct_transmission += L->path_transmission*L->direct_emission; L->direct_subsurface += L->path_subsurface*L->direct_emission; + L->direct_scatter += L->path_scatter*L->direct_emission; L->indirect = safe_divide_color(L->indirect, L->direct_throughput); L->indirect_diffuse += L->path_diffuse*L->indirect; L->indirect_glossy += L->path_glossy*L->indirect; L->indirect_transmission += L->path_transmission*L->indirect; L->indirect_subsurface += L->path_subsurface*L->indirect; + L->indirect_scatter += L->path_scatter*L->indirect; } #endif } @@ -309,6 +331,7 @@ ccl_device_inline void path_radiance_reset_indirect(PathRadiance *L) L->path_glossy = make_float3(0.0f, 0.0f, 0.0f); L->path_transmission = make_float3(0.0f, 0.0f, 0.0f); L->path_subsurface = make_float3(0.0f, 0.0f, 0.0f); + L->path_scatter = make_float3(0.0f, 0.0f, 0.0f); L->direct_emission = 
make_float3(0.0f, 0.0f, 0.0f); L->indirect = make_float3(0.0f, 0.0f, 0.0f); @@ -327,8 +350,8 @@ ccl_device_inline float3 path_radiance_clamp_and_sum(KernelGlobals *kg, PathRadi if(L->use_light_pass) { path_radiance_sum_indirect(L); - L_direct = L->direct_diffuse + L->direct_glossy + L->direct_transmission + L->direct_subsurface + L->emission; - L_indirect = L->indirect_diffuse + L->indirect_glossy + L->indirect_transmission + L->indirect_subsurface; + L_direct = L->direct_diffuse + L->direct_glossy + L->direct_transmission + L->direct_subsurface + L->direct_scatter + L->emission; + L_indirect = L->indirect_diffuse + L->indirect_glossy + L->indirect_transmission + L->indirect_subsurface + L->indirect_scatter; if(!kernel_data.background.transparent) L_direct += L->background; @@ -344,11 +367,13 @@ ccl_device_inline float3 path_radiance_clamp_and_sum(KernelGlobals *kg, PathRadi L->direct_glossy = make_float3(0.0f, 0.0f, 0.0f); L->direct_transmission = make_float3(0.0f, 0.0f, 0.0f); L->direct_subsurface = make_float3(0.0f, 0.0f, 0.0f); + L->direct_scatter = make_float3(0.0f, 0.0f, 0.0f); L->indirect_diffuse = make_float3(0.0f, 0.0f, 0.0f); L->indirect_glossy = make_float3(0.0f, 0.0f, 0.0f); L->indirect_transmission = make_float3(0.0f, 0.0f, 0.0f); L->indirect_subsurface = make_float3(0.0f, 0.0f, 0.0f); + L->indirect_scatter = make_float3(0.0f, 0.0f, 0.0f); L->emission = make_float3(0.0f, 0.0f, 0.0f); } @@ -368,6 +393,7 @@ ccl_device_inline float3 path_radiance_clamp_and_sum(KernelGlobals *kg, PathRadi L->direct_glossy *= scale; L->direct_transmission *= scale; L->direct_subsurface *= scale; + L->direct_scatter *= scale; L->emission *= scale; L->background *= scale; } @@ -382,6 +408,7 @@ ccl_device_inline float3 path_radiance_clamp_and_sum(KernelGlobals *kg, PathRadi L->indirect_glossy *= scale; L->indirect_transmission *= scale; L->indirect_subsurface *= scale; + L->indirect_scatter *= scale; } /* Sum again, after clamping */ @@ -416,11 +443,13 @@ ccl_device_inline 
void path_radiance_accum_sample(PathRadiance *L, PathRadiance L->direct_glossy += L_sample->direct_glossy*fac; L->direct_transmission += L_sample->direct_transmission*fac; L->direct_subsurface += L_sample->direct_subsurface*fac; + L->direct_scatter += L_sample->direct_scatter*fac; L->indirect_diffuse += L_sample->indirect_diffuse*fac; L->indirect_glossy += L_sample->indirect_glossy*fac; L->indirect_transmission += L_sample->indirect_transmission*fac; L->indirect_subsurface += L_sample->indirect_subsurface*fac; + L->indirect_scatter += L_sample->indirect_scatter*fac; L->emission += L_sample->emission*fac; L->background += L_sample->background*fac; diff --git a/intern/cycles/kernel/kernel_avx.cpp b/intern/cycles/kernel/kernel_avx.cpp index 354214c406e..e7ff21a6f09 100644 --- a/intern/cycles/kernel/kernel_avx.cpp +++ b/intern/cycles/kernel/kernel_avx.cpp @@ -24,6 +24,7 @@ #define __KERNEL_SSE3__ #define __KERNEL_SSSE3__ #define __KERNEL_SSE41__ +#define __KERNEL_AVX__ #endif #include "util_optimization.h" @@ -37,7 +38,7 @@ #include "kernel_globals.h" #include "kernel_film.h" #include "kernel_path.h" -#include "kernel_displace.h" +#include "kernel_bake.h" CCL_NAMESPACE_BEGIN @@ -67,9 +68,12 @@ void kernel_cpu_avx_convert_to_half_float(KernelGlobals *kg, uchar4 *rgba, float /* Shader Evaluate */ -void kernel_cpu_avx_shader(KernelGlobals *kg, uint4 *input, float4 *output, int type, int i) +void kernel_cpu_avx_shader(KernelGlobals *kg, uint4 *input, float4 *output, int type, int i, int offset, int sample) { - kernel_shader_evaluate(kg, input, output, (ShaderEvalType)type, i); + if(type >= SHADER_EVAL_BAKE) + kernel_bake_evaluate(kg, input, output, (ShaderEvalType)type, i, offset, sample); + else + kernel_shader_evaluate(kg, input, output, (ShaderEvalType)type, i, sample); } CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernel_avx2.cpp b/intern/cycles/kernel/kernel_avx2.cpp new file mode 100644 index 00000000000..cb1662bbfbe --- /dev/null +++ 
b/intern/cycles/kernel/kernel_avx2.cpp @@ -0,0 +1,87 @@ +/* + * Copyright 2011-2014 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License + */ + +/* Optimized CPU kernel entry points. This file is compiled with AVX2 + * optimization flags and nearly all functions inlined, while kernel.cpp + * is compiled without for other CPU's. */ + +/* SSE optimization disabled for now on 32 bit, see bug #36316 */ +#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) +#define __KERNEL_SSE2__ +#define __KERNEL_SSE3__ +#define __KERNEL_SSSE3__ +#define __KERNEL_SSE41__ +#define __KERNEL_AVX__ +#define __KERNEL_AVX2__ +#endif + +#include "util_optimization.h" + +#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 + +#include "kernel.h" +#include "kernel_compat_cpu.h" +#include "kernel_math.h" +#include "kernel_types.h" +#include "kernel_globals.h" +#include "kernel_film.h" +#include "kernel_path.h" +#include "kernel_bake.h" + +CCL_NAMESPACE_BEGIN + +/* Path Tracing */ + +void kernel_cpu_avx2_path_trace(KernelGlobals *kg, float *buffer, unsigned int *rng_state, int sample, int x, int y, int offset, int stride) +{ +#ifdef __BRANCHED_PATH__ + if(kernel_data.integrator.branched) + kernel_branched_path_trace(kg, buffer, rng_state, sample, x, y, offset, stride); + else +#endif + kernel_path_trace(kg, buffer, rng_state, sample, x, y, offset, stride); +} + +/* Film */ + +void kernel_cpu_avx2_convert_to_byte(KernelGlobals *kg, uchar4 *rgba, float *buffer, float 
sample_scale, int x, int y, int offset, int stride) +{ + kernel_film_convert_to_byte(kg, rgba, buffer, sample_scale, x, y, offset, stride); +} + +void kernel_cpu_avx2_convert_to_half_float(KernelGlobals *kg, uchar4 *rgba, float *buffer, float sample_scale, int x, int y, int offset, int stride) +{ + kernel_film_convert_to_half_float(kg, rgba, buffer, sample_scale, x, y, offset, stride); +} + +/* Shader Evaluate */ + +void kernel_cpu_avx2_shader(KernelGlobals *kg, uint4 *input, float4 *output, int type, int i, int offset, int sample) +{ + if(type >= SHADER_EVAL_BAKE) + kernel_bake_evaluate(kg, input, output, (ShaderEvalType)type, i, offset, sample); + else + kernel_shader_evaluate(kg, input, output, (ShaderEvalType)type, i, sample); +} + +CCL_NAMESPACE_END +#else + +/* needed for some linkers in combination with scons making empty compilation unit in a library */ +void __dummy_function_cycles_avx2(void); +void __dummy_function_cycles_avx2(void) {} + +#endif diff --git a/intern/cycles/kernel/kernel_displace.h b/intern/cycles/kernel/kernel_bake.h index b8c64af658f..a1ec080e3d3 100644 --- a/intern/cycles/kernel/kernel_displace.h +++ b/intern/cycles/kernel/kernel_bake.h @@ -17,65 +17,125 @@ CCL_NAMESPACE_BEGIN ccl_device void compute_light_pass(KernelGlobals *kg, ShaderData *sd, PathRadiance *L, RNG rng, - bool is_combined, bool is_ao, bool is_sss) + const bool is_combined, const bool is_ao, const bool is_sss, int sample) { - int samples = kernel_data.integrator.aa_samples; - /* initialize master radiance accumulator */ kernel_assert(kernel_data.film.use_light_pass); path_radiance_init(L, kernel_data.film.use_light_pass); - /* take multiple samples */ - for(int sample = 0; sample < samples; sample++) { - PathRadiance L_sample; - PathState state; - Ray ray; - float3 throughput = make_float3(1.0f, 1.0f, 1.0f); + PathRadiance L_sample; + PathState state; + Ray ray; + float3 throughput = make_float3(1.0f, 1.0f, 1.0f); + bool is_sss_sample = is_sss; - /* init radiance */ - 
path_radiance_init(&L_sample, kernel_data.film.use_light_pass); + /* init radiance */ + path_radiance_init(&L_sample, kernel_data.film.use_light_pass); - /* init path state */ - path_state_init(kg, &state, &rng, sample); - state.num_samples = samples; + /* init path state */ + path_state_init(kg, &state, &rng, sample, NULL); - /* evaluate surface shader */ - float rbsdf = path_state_rng_1D(kg, &rng, &state, PRNG_BSDF); - shader_eval_surface(kg, sd, rbsdf, state.flag, SHADER_CONTEXT_MAIN); + /* evaluate surface shader */ + float rbsdf = path_state_rng_1D(kg, &rng, &state, PRNG_BSDF); + shader_eval_surface(kg, sd, rbsdf, state.flag, SHADER_CONTEXT_MAIN); - /* TODO, disable the closures we won't need */ + /* TODO, disable the closures we won't need */ + +#ifdef __BRANCHED_PATH__ + if(!kernel_data.integrator.branched) { + /* regular path tracer */ +#endif /* sample ambient occlusion */ if(is_combined || is_ao) { kernel_path_ao(kg, sd, &L_sample, &state, &rng, throughput); } - /* sample subsurface scattering */ - if((is_combined || is_sss) && (sd->flag & SD_BSSRDF)) { #ifdef __SUBSURFACE__ + /* sample subsurface scattering */ + if((is_combined || is_sss_sample) && (sd->flag & SD_BSSRDF)) { /* when mixing BSSRDF and BSDF closures we should skip BSDF lighting if scattering was successful */ if (kernel_path_subsurface_scatter(kg, sd, &L_sample, &state, &rng, &ray, &throughput)) - is_sss = true; -#endif + is_sss_sample = true; } +#endif /* sample light and BSDF */ - if((!is_sss) && (!is_ao)) { - if(kernel_path_integrate_lighting(kg, &rng, sd, &throughput, &state, &L_sample, &ray)) { + if((!is_sss_sample) && (!is_ao)) { + + if(sd->flag & SD_EMISSION) { + float3 emission = indirect_primitive_emission(kg, sd, 0.0f, state.flag, state.ray_pdf); + path_radiance_accum_emission(&L_sample, throughput, emission, state.bounce); + } + + kernel_path_surface_connect_light(kg, &rng, sd, throughput, &state, &L_sample); + + if(kernel_path_surface_bounce(kg, &rng, sd, &throughput, &state, 
&L_sample, &ray)) { #ifdef __LAMP_MIS__ state.ray_t = 0.0f; #endif /* compute indirect light */ - kernel_path_indirect(kg, &rng, ray, throughput, state.num_samples, state, &L_sample); + kernel_path_indirect(kg, &rng, ray, throughput, 1, state, &L_sample); /* sum and reset indirect light pass variables for the next samples */ path_radiance_sum_indirect(&L_sample); path_radiance_reset_indirect(&L_sample); } } +#ifdef __BRANCHED_PATH__ + } + else { + /* branched path tracer */ + + /* sample ambient occlusion */ + if(is_combined || is_ao) { + kernel_branched_path_ao(kg, sd, &L_sample, &state, &rng, throughput); + } + +#ifdef __SUBSURFACE__ + /* sample subsurface scattering */ + if((is_combined || is_sss_sample) && (sd->flag & SD_BSSRDF)) { + /* when mixing BSSRDF and BSDF closures we should skip BSDF lighting if scattering was successful */ + kernel_branched_path_subsurface_scatter(kg, sd, &L_sample, &state, &rng, &ray, throughput); + } +#endif + + /* sample light and BSDF */ + if((!is_sss_sample) && (!is_ao)) { + + if(sd->flag & SD_EMISSION) { + float3 emission = indirect_primitive_emission(kg, sd, 0.0f, state.flag, state.ray_pdf); + path_radiance_accum_emission(&L_sample, throughput, emission, state.bounce); + } + +#if defined(__EMISSION__) + /* direct light */ + if(kernel_data.integrator.use_direct_light) { + bool all = kernel_data.integrator.sample_all_lights_direct; + kernel_branched_path_surface_connect_light(kg, &rng, + sd, &state, throughput, 1.0f, &L_sample, all); + } +#endif + + /* indirect light */ + kernel_branched_path_surface_indirect_light(kg, &rng, + sd, throughput, 1.0f, &state, &L_sample); + } + } +#endif + + /* accumulate into master L */ + path_radiance_accum_sample(L, &L_sample, 1); +} - /* accumulate into master L */ - path_radiance_accum_sample(L, &L_sample, samples); +ccl_device bool is_aa_pass(ShaderEvalType type) +{ + switch(type) { + case SHADER_EVAL_UV: + case SHADER_EVAL_NORMAL: + return false; + default: + return true; } } @@ -99,7 +159,21 
@@ ccl_device bool is_light_pass(ShaderEvalType type) } } -ccl_device void kernel_bake_evaluate(KernelGlobals *kg, ccl_global uint4 *input, ccl_global float4 *output, ShaderEvalType type, int i) +#if 0 +ccl_device_inline float bake_clamp_mirror_repeat(float u) +{ + /* use mirror repeat (like opengl texture) so that if the barycentric + * coordinate goes past the end of the triangle it is not always clamped + * to the same value, gives ugly patterns */ + float fu = floorf(u); + u = u - fu; + + return (((int)fu) & 1)? 1.0f - u: u; +} +#endif + +ccl_device void kernel_bake_evaluate(KernelGlobals *kg, ccl_global uint4 *input, ccl_global float4 *output, + ShaderEvalType type, int i, int offset, int sample) { ShaderData sd; uint4 in = input[i * 2]; @@ -121,10 +195,28 @@ ccl_device void kernel_bake_evaluate(KernelGlobals *kg, ccl_global uint4 *input, float dvdx = __uint_as_float(diff.z); float dvdy = __uint_as_float(diff.w); + int num_samples = kernel_data.integrator.aa_samples; + + /* random number generator */ + RNG rng = cmj_hash(offset + i, 0); + +#if 0 + uint rng_state = cmj_hash(i, 0); + float filter_x, filter_y; + path_rng_init(kg, &rng_state, sample, num_samples, &rng, 0, 0, &filter_x, &filter_y); + + /* subpixel u/v offset */ + if(sample > 0) { + u = bake_clamp_mirror_repeat(u + dudx*(filter_x - 0.5f) + dudy*(filter_y - 0.5f)); + v = bake_clamp_mirror_repeat(v + dvdx*(filter_x - 0.5f) + dvdy*(filter_y - 0.5f)); + } +#endif + + /* triangle */ int shader; float3 P, Ng; - triangle_point_normal(kg, prim, u, v, &P, &Ng, &shader); + triangle_point_normal(kg, object, prim, u, v, &P, &Ng, &shader); /* dummy initilizations copied from SHADER_EVAL_DISPLACE */ float3 I = Ng; @@ -147,12 +239,14 @@ ccl_device void kernel_bake_evaluate(KernelGlobals *kg, ccl_global uint4 *input, sd.dv.dx = dvdx; sd.dv.dy = dvdy; + /* light passes */ if(is_light_pass(type)) { - RNG rng = cmj_hash(i, 0); - compute_light_pass(kg, &sd, &L, rng, (type == SHADER_EVAL_COMBINED), - (type == 
SHADER_EVAL_AO), - (type == SHADER_EVAL_SUBSURFACE_DIRECT || - type == SHADER_EVAL_SUBSURFACE_INDIRECT)); + compute_light_pass(kg, &sd, &L, rng, + (type == SHADER_EVAL_COMBINED), + (type == SHADER_EVAL_AO), + (type == SHADER_EVAL_SUBSURFACE_DIRECT || + type == SHADER_EVAL_SUBSURFACE_INDIRECT), + sample); } switch (type) { @@ -307,17 +401,16 @@ ccl_device void kernel_bake_evaluate(KernelGlobals *kg, ccl_global uint4 *input, } /* write output */ - output[i] = make_float4(out.x, out.y, out.z, 1.0f); - return; + float output_fac = is_aa_pass(type)? 1.0f/num_samples: 1.0f; + + if(sample == 0) + output[i] = make_float4(out.x, out.y, out.z, 1.0f) * output_fac; + else + output[i] += make_float4(out.x, out.y, out.z, 1.0f) * output_fac; } -ccl_device void kernel_shader_evaluate(KernelGlobals *kg, ccl_global uint4 *input, ccl_global float4 *output, ShaderEvalType type, int i) +ccl_device void kernel_shader_evaluate(KernelGlobals *kg, ccl_global uint4 *input, ccl_global float4 *output, ShaderEvalType type, int i, int sample) { - if(type >= SHADER_EVAL_BAKE) { - kernel_bake_evaluate(kg, input, output, type, i); - return; - } - ShaderData sd; uint4 in = input[i]; float3 out; @@ -363,7 +456,10 @@ ccl_device void kernel_shader_evaluate(KernelGlobals *kg, ccl_global uint4 *inpu } /* write output */ - output[i] = make_float4(out.x, out.y, out.z, 0.0f); + if(sample == 0) + output[i] = make_float4(out.x, out.y, out.z, 0.0f); + else + output[i] += make_float4(out.x, out.y, out.z, 0.0f); } CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernel_camera.h b/intern/cycles/kernel/kernel_camera.h index 7fc66a9fdee..5c83358a56d 100644 --- a/intern/cycles/kernel/kernel_camera.h +++ b/intern/cycles/kernel/kernel_camera.h @@ -21,16 +21,22 @@ CCL_NAMESPACE_BEGIN ccl_device float2 camera_sample_aperture(KernelGlobals *kg, float u, float v) { float blades = kernel_data.cam.blades; + float2 bokeh; if(blades == 0.0f) { /* sample disk */ - return concentric_sample_disk(u, v); + bokeh = 
concentric_sample_disk(u, v); } else { /* sample polygon */ float rotation = kernel_data.cam.bladesrotation; - return regular_polygon_sample(blades, rotation, u, v); + bokeh = regular_polygon_sample(blades, rotation, u, v); } + + /* anamorphic lens bokeh */ + bokeh.x *= kernel_data.cam.inv_aperture_ratio; + + return bokeh; } ccl_device void camera_sample_perspective(KernelGlobals *kg, float raster_x, float raster_y, float lens_u, float lens_v, Ray *ray) @@ -183,7 +189,8 @@ ccl_device void camera_sample_panorama(KernelGlobals *kg, float raster_x, float /* calculate orthonormal coordinates perpendicular to D */ float3 U, V; - make_orthonormals(D, &U, &V); + U = normalize(make_float3(1.0f, 0.0f, 0.0f) - D.x * D); + V = normalize(cross(D, U)); /* update ray for effect of lens */ ray->P = U * lensuv.x + V * lensuv.y; @@ -262,6 +269,20 @@ ccl_device_inline float camera_distance(KernelGlobals *kg, float3 P) return len(P - camP); } +ccl_device_inline float3 camera_direction_from_point(KernelGlobals *kg, float3 P) +{ + Transform cameratoworld = kernel_data.cam.cameratoworld; + + if(kernel_data.cam.type == CAMERA_ORTHOGRAPHIC) { + float3 camD = make_float3(cameratoworld.x.z, cameratoworld.y.z, cameratoworld.z.z); + return -camD; + } + else { + float3 camP = make_float3(cameratoworld.x.w, cameratoworld.y.w, cameratoworld.z.w); + return normalize(camP - P); + } +} + ccl_device_inline float3 camera_world_to_ndc(KernelGlobals *kg, ShaderData *sd, float3 P) { if(kernel_data.cam.type != CAMERA_PANORAMA) { diff --git a/intern/cycles/kernel/kernel_compat_cpu.h b/intern/cycles/kernel/kernel_compat_cpu.h index d027bb62ebe..37cba03ff97 100644 --- a/intern/cycles/kernel/kernel_compat_cpu.h +++ b/intern/cycles/kernel/kernel_compat_cpu.h @@ -25,6 +25,13 @@ #include "util_half.h" #include "util_types.h" +/* On 64bit linux single precision exponent is really slow comparing to the + * double precision version, even with float<->double conversion involved. 
+ */ +#if !defined(__KERNEL_GPU__) && defined(__linux__) && defined(__x86_64__) +# define expf(x) ((float)exp((double)(x))) +#endif + CCL_NAMESPACE_BEGIN /* Assertions inside the kernel only work for the CPU device, so we wrap it in @@ -44,16 +51,16 @@ template<typename T> struct texture { } #if 0 - ccl_always_inline __m128 fetch_m128(int index) + ccl_always_inline ssef fetch_ssef(int index) { kernel_assert(index >= 0 && index < width); - return ((__m128*)data)[index]; + return ((ssef*)data)[index]; } - ccl_always_inline __m128i fetch_m128i(int index) + ccl_always_inline ssei fetch_ssei(int index) { kernel_assert(index >= 0 && index < width); - return ((__m128i*)data)[index]; + return ((ssei*)data)[index]; } #endif @@ -144,6 +151,13 @@ template<typename T> struct texture_image { ccl_always_inline float4 interp_3d(float x, float y, float z, bool periodic = false) { + return interp_3d_ex(x, y, z, interpolation, periodic); + } + + ccl_always_inline float4 interp_3d_ex(float x, float y, float z, + int interpolation = INTERPOLATION_LINEAR, + bool periodic = false) + { if(UNLIKELY(!data)) return make_float4(0.0f, 0.0f, 0.0f, 0.0f); @@ -167,7 +181,7 @@ template<typename T> struct texture_image { return read(data[ix + iy*width + iz*width*height]); } - else { + else if(interpolation == INTERPOLATION_LINEAR) { float tx = frac(x*(float)width - 0.5f, &ix); float ty = frac(y*(float)height - 0.5f, &iy); float tz = frac(z*(float)depth - 0.5f, &iz); @@ -205,6 +219,93 @@ template<typename T> struct texture_image { return r; } + else { + /* Tricubic b-spline interpolation. 
*/ + const float tx = frac(x*(float)width - 0.5f, &ix); + const float ty = frac(y*(float)height - 0.5f, &iy); + const float tz = frac(z*(float)depth - 0.5f, &iz); + int pix, piy, piz, nnix, nniy, nniz; + + if(periodic) { + ix = wrap_periodic(ix, width); + iy = wrap_periodic(iy, height); + iz = wrap_periodic(iz, depth); + + pix = wrap_periodic(ix-1, width); + piy = wrap_periodic(iy-1, height); + piz = wrap_periodic(iz-1, depth); + + nix = wrap_periodic(ix+1, width); + niy = wrap_periodic(iy+1, height); + niz = wrap_periodic(iz+1, depth); + + nnix = wrap_periodic(ix+2, width); + nniy = wrap_periodic(iy+2, height); + nniz = wrap_periodic(iz+2, depth); + } + else { + ix = wrap_clamp(ix, width); + iy = wrap_clamp(iy, height); + iz = wrap_clamp(iz, depth); + + pix = wrap_clamp(ix-1, width); + piy = wrap_clamp(iy-1, height); + piz = wrap_clamp(iz-1, depth); + + nix = wrap_clamp(ix+1, width); + niy = wrap_clamp(iy+1, height); + niz = wrap_clamp(iz+1, depth); + + nnix = wrap_clamp(ix+2, width); + nniy = wrap_clamp(iy+2, height); + nniz = wrap_clamp(iz+2, depth); + } + + const int xc[4] = {pix, ix, nix, nnix}; + const int yc[4] = {width * piy, + width * iy, + width * niy, + width * nniy}; + const int zc[4] = {width * height * piz, + width * height * iz, + width * height * niz, + width * height * nniz}; + float u[4], v[4], w[4]; + + /* Some helper macro to keep code reasonable size, + * let compiler to inline all the matrix multiplications. 
+ */ +#define SET_SPLINE_WEIGHTS(u, t) \ + { \ + u[0] = (((-1.0f/6.0f)* t + 0.5f) * t - 0.5f) * t + (1.0f/6.0f); \ + u[1] = (( 0.5f * t - 1.0f) * t ) * t + (2.0f/3.0f); \ + u[2] = (( -0.5f * t + 0.5f) * t + 0.5f) * t + (1.0f/6.0f); \ + u[3] = (1.0f / 6.0f) * t * t * t; \ + } (void)0 +#define DATA(x, y, z) (read(data[xc[x] + yc[y] + zc[z]])) +#define COL_TERM(col, row) \ + (v[col] * (u[0] * DATA(0, col, row) + \ + u[1] * DATA(1, col, row) + \ + u[2] * DATA(2, col, row) + \ + u[3] * DATA(3, col, row))) +#define ROW_TERM(row) \ + (w[row] * (COL_TERM(0, row) + \ + COL_TERM(1, row) + \ + COL_TERM(2, row) + \ + COL_TERM(3, row))) + + SET_SPLINE_WEIGHTS(u, tx); + SET_SPLINE_WEIGHTS(v, ty); + SET_SPLINE_WEIGHTS(w, tz); + + /* Actual interpolation. */ + return ROW_TERM(0) + ROW_TERM(1) + ROW_TERM(2) + ROW_TERM(3); + +#undef COL_TERM +#undef ROW_TERM +#undef DATA +#undef SET_SPLINE_WEIGHTS + } } ccl_always_inline void dimensions_set(int width_, int height_, int depth_) @@ -232,11 +333,12 @@ typedef texture_image<uchar4> texture_image_uchar4; /* Macros to handle different memory storage on different devices */ #define kernel_tex_fetch(tex, index) (kg->tex.fetch(index)) -#define kernel_tex_fetch_m128(tex, index) (kg->tex.fetch_m128(index)) -#define kernel_tex_fetch_m128i(tex, index) (kg->tex.fetch_m128i(index)) +#define kernel_tex_fetch_ssef(tex, index) (kg->tex.fetch_ssef(index)) +#define kernel_tex_fetch_ssei(tex, index) (kg->tex.fetch_ssei(index)) #define kernel_tex_lookup(tex, t, offset, size) (kg->tex.lookup(t, offset, size)) #define kernel_tex_image_interp(tex, x, y) ((tex < MAX_FLOAT_IMAGES) ? kg->texture_float_images[tex].interp(x, y) : kg->texture_byte_images[tex - MAX_FLOAT_IMAGES].interp(x, y)) #define kernel_tex_image_interp_3d(tex, x, y, z) ((tex < MAX_FLOAT_IMAGES) ? 
kg->texture_float_images[tex].interp_3d(x, y, z) : kg->texture_byte_images[tex - MAX_FLOAT_IMAGES].interp_3d(x, y, z)) +#define kernel_tex_image_interp_3d_ex(tex, x, y, z, interpolation) ((tex < MAX_FLOAT_IMAGES) ? kg->texture_float_images[tex].interp_3d_ex(x, y, z, interpolation) : kg->texture_byte_images[tex - MAX_FLOAT_IMAGES].interp_3d_ex(x, y, z, interpolation)) #define kernel_data (kg->__data) diff --git a/intern/cycles/kernel/kernel_compat_cuda.h b/intern/cycles/kernel/kernel_compat_cuda.h index e4c20d26ff1..f14f3262274 100644 --- a/intern/cycles/kernel/kernel_compat_cuda.h +++ b/intern/cycles/kernel/kernel_compat_cuda.h @@ -75,12 +75,11 @@ typedef texture<uchar4, 2, cudaReadModeNormalizedFloat> texture_image_uchar4; /* Use fast math functions */ -#define cosf(x) __cosf(((float)x)) -#define sinf(x) __sinf(((float)x)) -#define powf(x, y) __powf(((float)x), ((float)y)) -#define tanf(x) __tanf(((float)x)) -#define logf(x) __logf(((float)x)) -#define expf(x) __expf(((float)x)) +#define cosf(x) __cosf(((float)(x))) +#define sinf(x) __sinf(((float)(x))) +#define powf(x, y) __powf(((float)(x)), ((float)(y))) +#define tanf(x) __tanf(((float)(x))) +#define logf(x) __logf(((float)(x))) +#define expf(x) __expf(((float)(x))) #endif /* __KERNEL_COMPAT_CUDA_H__ */ - diff --git a/intern/cycles/kernel/kernel_compat_opencl.h b/intern/cycles/kernel/kernel_compat_opencl.h index 8346b09619e..58031a41b78 100644 --- a/intern/cycles/kernel/kernel_compat_opencl.h +++ b/intern/cycles/kernel/kernel_compat_opencl.h @@ -24,14 +24,6 @@ #define CCL_NAMESPACE_BEGIN #define CCL_NAMESPACE_END -#ifdef __KERNEL_OPENCL_AMD__ -#define __CL_NO_FLOAT3__ -#endif - -#ifdef __CL_NO_FLOAT3__ -#define float3 float4 -#endif - #ifdef __CL_NOINLINE__ #define ccl_noinline __attribute__((noinline)) #else @@ -68,51 +60,51 @@ #ifdef make_int4 #undef make_int4 #endif +#ifdef make_uchar4 +#undef make_uchar4 +#endif #define make_float2(x, y) ((float2)(x, y)) -#ifdef __CL_NO_FLOAT3__ -#define make_float3(x, y, 
z) ((float4)(x, y, z, 0.0f)) -#else #define make_float3(x, y, z) ((float3)(x, y, z)) -#endif #define make_float4(x, y, z, w) ((float4)(x, y, z, w)) #define make_int2(x, y) ((int2)(x, y)) #define make_int3(x, y, z) ((int3)(x, y, z)) #define make_int4(x, y, z, w) ((int4)(x, y, z, w)) +#define make_uchar4(x, y, z, w) ((uchar4)(x, y, z, w)) /* math functions */ #define __uint_as_float(x) as_float(x) #define __float_as_uint(x) as_uint(x) #define __int_as_float(x) as_float(x) #define __float_as_int(x) as_int(x) -#define powf(x, y) pow(((float)x), ((float)y)) -#define fabsf(x) fabs(((float)x)) -#define copysignf(x, y) copysign(((float)x), ((float)y)) -#define asinf(x) asin(((float)x)) -#define acosf(x) acos(((float)x)) -#define atanf(x) atan(((float)x)) -#define floorf(x) floor(((float)x)) -#define ceilf(x) ceil(((float)x)) -#define hypotf(x, y) hypot(((float)x), ((float)y)) -#define atan2f(x, y) atan2(((float)x), ((float)y)) -#define fmaxf(x, y) fmax(((float)x), ((float)y)) -#define fminf(x, y) fmin(((float)x), ((float)y)) -#define fmodf(x, y) fmod((float)x, (float)y) +#define powf(x, y) pow(((float)(x)), ((float)(y))) +#define fabsf(x) fabs(((float)(x))) +#define copysignf(x, y) copysign(((float)(x)), ((float)(y))) +#define asinf(x) asin(((float)(x))) +#define acosf(x) acos(((float)(x))) +#define atanf(x) atan(((float)(x))) +#define floorf(x) floor(((float)(x))) +#define ceilf(x) ceil(((float)(x))) +#define hypotf(x, y) hypot(((float)(x)), ((float)(y))) +#define atan2f(x, y) atan2(((float)(x)), ((float)(y))) +#define fmaxf(x, y) fmax(((float)(x)), ((float)(y))) +#define fminf(x, y) fmin(((float)(x)), ((float)(y))) +#define fmodf(x, y) fmod((float)(x), (float)(y)) #ifndef __CL_USE_NATIVE__ -#define sinf(x) native_sin(((float)x)) -#define cosf(x) native_cos(((float)x)) -#define tanf(x) native_tan(((float)x)) -#define expf(x) native_exp(((float)x)) -#define sqrtf(x) native_sqrt(((float)x)) -#define logf(x) native_log(((float)x)) +#define sinf(x) native_sin(((float)(x))) 
+#define cosf(x) native_cos(((float)(x))) +#define tanf(x) native_tan(((float)(x))) +#define expf(x) native_exp(((float)(x))) +#define sqrtf(x) native_sqrt(((float)(x))) +#define logf(x) native_log(((float)(x))) #else -#define sinf(x) sin(((float)x)) -#define cosf(x) cos(((float)x)) -#define tanf(x) tan(((float)x)) -#define expf(x) exp(((float)x)) -#define sqrtf(x) sqrt(((float)x)) -#define logf(x) log(((float)x)) +#define sinf(x) sin(((float)(x))) +#define cosf(x) cos(((float)(x))) +#define tanf(x) tan(((float)(x))) +#define expf(x) exp(((float)(x))) +#define sqrtf(x) sqrt(((float)(x))) +#define logf(x) log(((float)(x))) #endif /* data lookup defines */ diff --git a/intern/cycles/kernel/kernel_debug.h b/intern/cycles/kernel/kernel_debug.h new file mode 100644 index 00000000000..bf1bc0e9db8 --- /dev/null +++ b/intern/cycles/kernel/kernel_debug.h @@ -0,0 +1,38 @@ +/* + * Copyright 2011-2014 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License + */ + +CCL_NAMESPACE_BEGIN + +ccl_device_inline void debug_data_init(DebugData *debug_data) +{ + debug_data->num_bvh_traversal_steps = 0; +} + +ccl_device_inline void kernel_write_debug_passes(KernelGlobals *kg, + ccl_global float *buffer, + PathState *state, + DebugData *debug_data, + int sample) +{ + int flag = kernel_data.film.pass_flag; + if(flag & PASS_BVH_TRAVERSAL_STEPS) { + kernel_write_pass_float(buffer + kernel_data.film.pass_bvh_traversal_steps, + sample, + debug_data->num_bvh_traversal_steps); + } +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernel_emission.h b/intern/cycles/kernel/kernel_emission.h index deffa7f2ba2..4b2bb723ab6 100644 --- a/intern/cycles/kernel/kernel_emission.h +++ b/intern/cycles/kernel/kernel_emission.h @@ -63,32 +63,18 @@ ccl_device_noinline float3 direct_emissive_eval(KernelGlobals *kg, return eval; } -ccl_device_noinline bool direct_emission(KernelGlobals *kg, ShaderData *sd, int lindex, - float randt, float randu, float randv, Ray *ray, BsdfEval *eval, - bool *is_lamp, int bounce, int transparent_bounce) +ccl_device_noinline bool direct_emission(KernelGlobals *kg, ShaderData *sd, + LightSample *ls, Ray *ray, BsdfEval *eval, bool *is_lamp, + int bounce, int transparent_bounce) { - LightSample ls; - -#ifdef __BRANCHED_PATH__ - if(lindex != LAMP_NONE) { - /* sample position on a specified light */ - light_select(kg, lindex, randu, randv, sd->P, &ls); - } - else -#endif - { - /* sample a light and position on int */ - light_sample(kg, randt, randu, randv, sd->time, sd->P, &ls); - } - - if(ls.pdf == 0.0f) + if(ls->pdf == 0.0f) return false; /* todo: implement */ differential3 dD = differential3_zero(); /* evaluate closure */ - float3 light_eval = direct_emissive_eval(kg, &ls, -ls.D, dD, ls.t, sd->time, bounce, transparent_bounce); + float3 light_eval = direct_emissive_eval(kg, ls, -ls->D, dD, ls->t, sd->time, 
bounce, transparent_bounce); if(is_zero(light_eval)) return false; @@ -98,49 +84,51 @@ ccl_device_noinline bool direct_emission(KernelGlobals *kg, ShaderData *sd, int #ifdef __VOLUME__ if(sd->prim != PRIM_NONE) - shader_bsdf_eval(kg, sd, ls.D, eval, &bsdf_pdf); + shader_bsdf_eval(kg, sd, ls->D, eval, &bsdf_pdf); else - shader_volume_phase_eval(kg, sd, ls.D, eval, &bsdf_pdf); + shader_volume_phase_eval(kg, sd, ls->D, eval, &bsdf_pdf); #else - shader_bsdf_eval(kg, sd, ls.D, eval, &bsdf_pdf); + shader_bsdf_eval(kg, sd, ls->D, eval, &bsdf_pdf); #endif - if(ls.shader & SHADER_USE_MIS) { + if(ls->shader & SHADER_USE_MIS) { /* multiple importance sampling */ - float mis_weight = power_heuristic(ls.pdf, bsdf_pdf); + float mis_weight = power_heuristic(ls->pdf, bsdf_pdf); light_eval *= mis_weight; } - bsdf_eval_mul(eval, light_eval/ls.pdf); + bsdf_eval_mul(eval, light_eval/ls->pdf); #ifdef __PASSES__ /* use visibility flag to skip lights */ - if(ls.shader & SHADER_EXCLUDE_ANY) { - if(ls.shader & SHADER_EXCLUDE_DIFFUSE) + if(ls->shader & SHADER_EXCLUDE_ANY) { + if(ls->shader & SHADER_EXCLUDE_DIFFUSE) eval->diffuse = make_float3(0.0f, 0.0f, 0.0f); - if(ls.shader & SHADER_EXCLUDE_GLOSSY) + if(ls->shader & SHADER_EXCLUDE_GLOSSY) eval->glossy = make_float3(0.0f, 0.0f, 0.0f); - if(ls.shader & SHADER_EXCLUDE_TRANSMIT) + if(ls->shader & SHADER_EXCLUDE_TRANSMIT) eval->transmission = make_float3(0.0f, 0.0f, 0.0f); + if(ls->shader & SHADER_EXCLUDE_SCATTER) + eval->scatter = make_float3(0.0f, 0.0f, 0.0f); } #endif if(bsdf_eval_is_zero(eval)) return false; - if(ls.shader & SHADER_CAST_SHADOW) { + if(ls->shader & SHADER_CAST_SHADOW) { /* setup ray */ - bool transmit = (dot(sd->Ng, ls.D) < 0.0f); + bool transmit = (dot(sd->Ng, ls->D) < 0.0f); ray->P = ray_offset(sd->P, (transmit)? 
-sd->Ng: sd->Ng); - if(ls.t == FLT_MAX) { + if(ls->t == FLT_MAX) { /* distant light */ - ray->D = ls.D; - ray->t = ls.t; + ray->D = ls->D; + ray->t = ls->t; } else { /* other lights, avoid self-intersection */ - ray->D = ray_offset(ls.P, ls.Ng) - ray->P; + ray->D = ray_offset(ls->P, ls->Ng) - ray->P; ray->D = normalize_len(ray->D, &ray->t); } @@ -153,7 +141,7 @@ ccl_device_noinline bool direct_emission(KernelGlobals *kg, ShaderData *sd, int } /* return if it's a lamp for shadow pass */ - *is_lamp = (ls.prim == PRIM_NONE && ls.type != LIGHT_BACKGROUND); + *is_lamp = (ls->prim == PRIM_NONE && ls->type != LIGHT_BACKGROUND); return true; } @@ -201,13 +189,25 @@ ccl_device_noinline bool indirect_lamp_emission(KernelGlobals *kg, PathState *st if(ls.shader & SHADER_EXCLUDE_ANY) { if(((ls.shader & SHADER_EXCLUDE_DIFFUSE) && (state->flag & PATH_RAY_DIFFUSE)) || ((ls.shader & SHADER_EXCLUDE_GLOSSY) && (state->flag & PATH_RAY_GLOSSY)) || - ((ls.shader & SHADER_EXCLUDE_TRANSMIT) && (state->flag & PATH_RAY_TRANSMIT))) + ((ls.shader & SHADER_EXCLUDE_TRANSMIT) && (state->flag & PATH_RAY_TRANSMIT)) || + ((ls.shader & SHADER_EXCLUDE_SCATTER) && (state->flag & PATH_RAY_VOLUME_SCATTER))) continue; } #endif float3 L = direct_emissive_eval(kg, &ls, -ray->D, ray->dD, ls.t, ray->time, state->bounce, state->transparent_bounce); +#ifdef __VOLUME__ + if(state->volume_stack[0].shader != SHADER_NONE) { + /* shadow attenuation */ + Ray volume_ray = *ray; + volume_ray.t = ls.t; + float3 volume_tp = make_float3(1.0f, 1.0f, 1.0f); + kernel_volume_shadow(kg, state, &volume_ray, &volume_tp); + L *= volume_tp; + } +#endif + if(!(state->flag & PATH_RAY_MIS_SKIP)) { /* multiple importance sampling, get regular light pdf, * and compute weight with respect to BSDF pdf */ @@ -234,7 +234,8 @@ ccl_device_noinline float3 indirect_background(KernelGlobals *kg, PathState *sta if(((shader & SHADER_EXCLUDE_DIFFUSE) && (state->flag & PATH_RAY_DIFFUSE)) || ((shader & SHADER_EXCLUDE_GLOSSY) && (state->flag & 
PATH_RAY_GLOSSY)) || ((shader & SHADER_EXCLUDE_TRANSMIT) && (state->flag & PATH_RAY_TRANSMIT)) || - ((shader & SHADER_EXCLUDE_CAMERA) && (state->flag & PATH_RAY_CAMERA))) + ((shader & SHADER_EXCLUDE_CAMERA) && (state->flag & PATH_RAY_CAMERA)) || + ((shader & SHADER_EXCLUDE_SCATTER) && (state->flag & PATH_RAY_VOLUME_SCATTER))) return make_float3(0.0f, 0.0f, 0.0f); } diff --git a/intern/cycles/kernel/kernel_jitter.h b/intern/cycles/kernel/kernel_jitter.h index 7a850844bf2..2a5b7689e57 100644 --- a/intern/cycles/kernel/kernel_jitter.h +++ b/intern/cycles/kernel/kernel_jitter.h @@ -14,6 +14,8 @@ * limitations under the License */ +/* TODO(sergey): Consider moving portable ctz/clz stuff to util. */ + CCL_NAMESPACE_BEGIN /* "Correlated Multi-Jittered Sampling" @@ -35,8 +37,16 @@ ccl_device_inline int cmj_fast_mod_pow2(int a, int b) /* a must be > 0 and b must be > 1 */ ccl_device_inline int cmj_fast_div_pow2(int a, int b) { -#if defined(__KERNEL_SSE2__) && !defined(_MSC_VER) + kernel_assert(a > 0); + kernel_assert(b > 1); +#if defined(__KERNEL_SSE2__) +# ifdef _MSC_VER + unsigned long ctz; + _BitScanForward(&ctz, b); + return a >> ctz; +# else return a >> __builtin_ctz(b); +# endif #else return a/b; #endif @@ -44,8 +54,15 @@ ccl_device_inline int cmj_fast_div_pow2(int a, int b) ccl_device_inline uint cmj_w_mask(uint w) { -#if defined(__KERNEL_SSE2__) && !defined(_MSC_VER) + kernel_assert(w > 1); +#if defined(__KERNEL_SSE2__) +# ifdef _MSC_VER + unsigned long leading_zero; + _BitScanReverse(&leading_zero, w); + return ((1 << (1 + leading_zero)) - 1); +# else return ((1 << (32 - __builtin_clz(w))) - 1); +# endif #else w |= w >> 1; w |= w >> 2; @@ -165,7 +182,8 @@ ccl_device void cmj_sample_2D(int s, int N, int p, float *fx, float *fy) smodm = cmj_fast_mod_pow2(s, m); } else { - sdivm = float_to_int(s * invm); + /* Doing s*inmv gives precision issues here. 
*/ + sdivm = s / m; smodm = s - sdivm*m; } diff --git a/intern/cycles/kernel/kernel_light.h b/intern/cycles/kernel/kernel_light.h index ac432d3fe04..b18f67ad524 100644 --- a/intern/cycles/kernel/kernel_light.h +++ b/intern/cycles/kernel/kernel_light.h @@ -27,7 +27,7 @@ typedef struct LightSample { float pdf; /* light sampling probability density function */ float eval_fac; /* intensity multiplier */ int object; /* object id for triangle/curve lights */ - int prim; /* primitive id for triangle/curve ligths */ + int prim; /* primitive id for triangle/curve lights */ int shader; /* shader id */ int lamp; /* lamp id */ LightType type; /* type of light */ @@ -167,12 +167,137 @@ ccl_device float3 sphere_light_sample(float3 P, float3 center, float radius, flo return disk_light_sample(normalize(P - center), randu, randv)*radius; } -ccl_device float3 area_light_sample(float3 axisu, float3 axisv, float randu, float randv) +/* Uses the following paper: + * + * Carlos Urena et al. + * An Area-Preserving Parametrization for Spherical Rectangles. + * + * https://www.solidangle.com/research/egsr2013_spherical_rectangle.pdf + */ +ccl_device float3 area_light_sample(float3 P, + float3 light_p, + float3 axisu, float3 axisv, + float randu, float randv, + float *pdf) { - randu = randu - 0.5f; - randv = randv - 0.5f; + /* In our name system we're using P for the center, + * which is o in the paper. + */ + + float3 corner = light_p - axisu * 0.5f - axisv * 0.5f; + float axisu_len, axisv_len; + /* Compute local reference system R. */ + float3 x = normalize_len(axisu, &axisu_len); + float3 y = normalize_len(axisv, &axisv_len); + float3 z = cross(x, y); + /* Compute rectangle coords in local reference system. */ + float3 dir = corner - P; + float z0 = dot(dir, z); + /* Flip 'z' to make it point against Q. 
*/ + if(z0 > 0.0f) { + z *= -1.0f; + z0 *= -1.0f; + } + float z0sq = z0 * z0; + float x0 = dot(dir, x); + float y0 = dot(dir, y); + float x1 = x0 + axisu_len; + float y1 = y0 + axisv_len; + float y0sq = y0 * y0; + float y1sq = y1 * y1; + /* Create vectors to four vertices. */ + float3 v00 = make_float3(x0, y0, z0); + float3 v01 = make_float3(x0, y1, z0); + float3 v10 = make_float3(x1, y0, z0); + float3 v11 = make_float3(x1, y1, z0); + /* Compute normals to edges. */ + float3 n0 = normalize(cross(v00, v10)); + float3 n1 = normalize(cross(v10, v11)); + float3 n2 = normalize(cross(v11, v01)); + float3 n3 = normalize(cross(v01, v00)); + /* Compute internal angles (gamma_i). */ + float g0 = acosf(-dot(n0, n1)); + float g1 = acosf(-dot(n1, n2)); + float g2 = acosf(-dot(n2, n3)); + float g3 = acosf(-dot(n3, n0)); + /* Compute predefined constants. */ + float b0 = n0.z; + float b1 = n2.z; + float b0sq = b0 * b0; + float k = M_2PI_F - g2 - g3; + /* Compute solid angle from internal angles. */ + float S = g0 + g1 - k; + + /* Compute cu. */ + float au = randu * S + k; + float fu = (cosf(au) * b0 - b1) / sinf(au); + float cu = 1.0f / sqrtf(fu * fu + b0sq) * (fu > 0.0f ? 1.0f : -1.0f); + cu = clamp(cu, -1.0f, 1.0f); + /* Compute xu. */ + float xu = -(cu * z0) / sqrtf(1.0f - cu * cu); + xu = clamp(xu, x0, x1); + /* Compute yv. */ + float d = sqrtf(xu * xu + z0sq); + float h0 = y0 / sqrtf(d * d + y0sq); + float h1 = y1 / sqrtf(d * d + y1sq); + float hv = h0 + randv * (h1 - h0), hv2 = hv * hv; + float yv = (hv2 < 1.0f - 1e-6f) ? (hv * d) / sqrtf(1.0f - hv2) : y1; + + *pdf = 1.0f / S; + + /* Transform (xu, yv, z0) to world coords. */ + return P + xu * x + yv * y + z0 * z; +} - return axisu*randu + axisv*randv; +/* TODO(sergey): This is actually a duplicated code from above, but how to avoid + * this without having some nasty function with loads of parameters? 
+ */ +ccl_device float area_light_pdf(float3 P, + float3 light_p, + float3 axisu, float3 axisv) +{ + /* In our name system we're using P for the center, + * which is o in the paper. + */ + + float3 corner = light_p - axisu * 0.5f - axisv * 0.5f; + float axisu_len, axisv_len; + /* Compute local reference system R. */ + float3 x = normalize_len(axisu, &axisu_len); + float3 y = normalize_len(axisv, &axisv_len); + float3 z = cross(x, y); + /* Compute rectangle coords in local reference system. */ + float3 dir = corner - P; + float z0 = dot(dir, z); + /* Flip 'z' to make it point against Q. */ + if(z0 > 0.0f) { + z *= -1.0f; + z0 *= -1.0f; + } + float x0 = dot(dir, x); + float y0 = dot(dir, y); + float x1 = x0 + axisu_len; + float y1 = y0 + axisv_len; + /* Create vectors to four vertices. */ + float3 v00 = make_float3(x0, y0, z0); + float3 v01 = make_float3(x0, y1, z0); + float3 v10 = make_float3(x1, y0, z0); + float3 v11 = make_float3(x1, y1, z0); + /* Compute normals to edges. */ + float3 n0 = normalize(cross(v00, v10)); + float3 n1 = normalize(cross(v10, v11)); + float3 n2 = normalize(cross(v11, v01)); + float3 n3 = normalize(cross(v01, v00)); + /* Compute internal angles (gamma_i). */ + float g0 = acosf(-dot(n0, n1)); + float g1 = acosf(-dot(n1, n2)); + float g2 = acosf(-dot(n2, n3)); + float g3 = acosf(-dot(n3, n0)); + /* Compute predefined constants. */ + float k = M_2PI_F - g2 - g3; + /* Compute solid angle from internal angles. 
*/ + float S = g0 + g1 - k; + return 1.0f / S; } ccl_device float spot_light_attenuation(float4 data1, float4 data2, LightSample *ls) @@ -276,6 +401,7 @@ ccl_device void lamp_light_sample(KernelGlobals *kg, int lamp, float4 data2 = kernel_tex_fetch(__light_data, lamp*LIGHT_SIZE + 2); ls->eval_fac *= spot_light_attenuation(data1, data2, ls); } + ls->pdf *= lamp_light_pdf(kg, ls->Ng, -ls->D, ls->t); } else { /* area light */ @@ -286,18 +412,22 @@ ccl_device void lamp_light_sample(KernelGlobals *kg, int lamp, float3 axisv = make_float3(data2.y, data2.z, data2.w); float3 D = make_float3(data3.y, data3.z, data3.w); - ls->P += area_light_sample(axisu, axisv, randu, randv); + ls->P = area_light_sample(P, ls->P, + axisu, axisv, + randu, randv, + &ls->pdf); + ls->Ng = D; ls->D = normalize_len(ls->P - P, &ls->t); float invarea = data2.x; - ls->eval_fac = 0.25f*invarea; - ls->pdf = invarea; + + if(dot(ls->D, D) > 0.0f) + ls->pdf = 0.0f; } ls->eval_fac *= kernel_data.integrator.inv_pdf_lights; - ls->pdf *= lamp_light_pdf(kg, ls->Ng, -ls->D, ls->t); } } @@ -355,8 +485,12 @@ ccl_device bool lamp_light_eval(KernelGlobals *kg, int lamp, float3 P, float3 D, ls->D = D; ls->t = FLT_MAX; + /* compute pdf */ float invarea = data1.w; ls->pdf = invarea/(costheta*costheta*costheta); + if(ls->t != FLT_MAX) + ls->pdf *= lamp_light_pdf(kg, ls->Ng, -ls->D, ls->t); + ls->eval_fac = ls->pdf; } else if(type == LIGHT_POINT || type == LIGHT_SPOT) { @@ -386,6 +520,10 @@ ccl_device bool lamp_light_eval(KernelGlobals *kg, int lamp, float3 P, float3 D, if(ls->eval_fac == 0.0f) return false; } + + /* compute pdf */ + if(ls->t != FLT_MAX) + ls->pdf *= lamp_light_pdf(kg, ls->Ng, -ls->D, ls->t); } else if(type == LIGHT_AREA) { /* area light */ @@ -412,16 +550,12 @@ ccl_device bool lamp_light_eval(KernelGlobals *kg, int lamp, float3 P, float3 D, ls->D = D; ls->Ng = Ng; - ls->pdf = invarea; - ls->eval_fac = 0.25f*ls->pdf; + ls->pdf = area_light_pdf(P, ls->P, axisu, axisv); + ls->eval_fac = 0.25f*invarea; } 
else return false; - /* compute pdf */ - if(ls->t != FLT_MAX) - ls->pdf *= lamp_light_pdf(kg, ls->Ng, -ls->D, ls->t); - return true; } @@ -457,7 +591,7 @@ ccl_device void triangle_light_sample(KernelGlobals *kg, int prim, int object, v = randv*randu; /* triangle, so get position, normal, shader */ - triangle_point_normal(kg, prim, u, v, &ls->P, &ls->Ng, &ls->shader); + triangle_point_normal(kg, object, prim, u, v, &ls->P, &ls->Ng, &ls->shader); ls->object = object; ls->prim = prim; ls->lamp = LAMP_NONE; @@ -546,11 +680,6 @@ ccl_device int light_select_num_samples(KernelGlobals *kg, int index) return __float_as_int(data3.x); } -ccl_device void light_select(KernelGlobals *kg, int index, float randu, float randv, float3 P, LightSample *ls) -{ - lamp_light_sample(kg, index, randu, randv, P, ls); -} - ccl_device int lamp_light_eval_sample(KernelGlobals *kg, float randt) { /* sample index */ diff --git a/intern/cycles/kernel/kernel_path.h b/intern/cycles/kernel/kernel_path.h index a80a0033712..c03229f0a3a 100644 --- a/intern/cycles/kernel/kernel_path.h +++ b/intern/cycles/kernel/kernel_path.h @@ -29,7 +29,6 @@ #include "kernel_accumulate.h" #include "kernel_shader.h" #include "kernel_light.h" -#include "kernel_emission.h" #include "kernel_passes.h" #ifdef __SUBSURFACE__ @@ -42,177 +41,15 @@ #include "kernel_path_state.h" #include "kernel_shadow.h" +#include "kernel_emission.h" +#include "kernel_path_surface.h" +#include "kernel_path_volume.h" -CCL_NAMESPACE_BEGIN - -#ifdef __VOLUME__ - -ccl_device_inline bool kernel_path_integrate_scatter_lighting(KernelGlobals *kg, RNG *rng, - ShaderData *sd, float3 *throughput, PathState *state, PathRadiance *L, Ray *ray, - float num_samples_adjust) -{ -#ifdef __EMISSION__ - if(kernel_data.integrator.use_direct_light) { - /* sample illumination from lights to find path contribution */ - if(sd->flag & SD_BSDF_HAS_EVAL) { - float light_t = path_state_rng_1D(kg, rng, state, PRNG_LIGHT); - float light_u, light_v; - path_state_rng_2D(kg, 
rng, state, PRNG_LIGHT_U, &light_u, &light_v); - - Ray light_ray; - BsdfEval L_light; - bool is_lamp; - -#ifdef __OBJECT_MOTION__ - light_ray.time = sd->time; -#endif - - if(direct_emission(kg, sd, LAMP_NONE, light_t, light_u, light_v, &light_ray, &L_light, &is_lamp, state->bounce, state->transparent_bounce)) { - /* trace shadow ray */ - float3 shadow; - - if(!shadow_blocked(kg, state, &light_ray, &shadow)) { - /* accumulate */ - path_radiance_accum_light(L, *throughput * num_samples_adjust, &L_light, shadow, 1.0f, state->bounce, is_lamp); - } - } - } - } -#endif - - /* sample phase function */ - float phase_pdf; - BsdfEval phase_eval; - float3 phase_omega_in; - differential3 phase_domega_in; - float phase_u, phase_v; - path_state_rng_2D(kg, rng, state, PRNG_PHASE_U, &phase_u, &phase_v); - int label; - - label = shader_volume_phase_sample(kg, sd, phase_u, phase_v, &phase_eval, - &phase_omega_in, &phase_domega_in, &phase_pdf); - - if(phase_pdf == 0.0f || bsdf_eval_is_zero(&phase_eval)) - return false; - - /* modify throughput */ - path_radiance_bsdf_bounce(L, throughput, &phase_eval, phase_pdf, state->bounce, label); - - /* set labels */ - state->ray_pdf = phase_pdf; -#ifdef __LAMP_MIS__ - state->ray_t = 0.0f; -#endif - state->min_ray_pdf = fminf(phase_pdf, state->min_ray_pdf); - - /* update path state */ - path_state_next(kg, state, label); - - /* setup ray */ - ray->P = sd->P; - ray->D = phase_omega_in; - ray->t = FLT_MAX; - -#ifdef __RAY_DIFFERENTIALS__ - ray->dP = sd->dP; - ray->dD = phase_domega_in; -#endif - - return true; -} - +#ifdef __KERNEL_DEBUG__ +#include "kernel_debug.h" #endif -#if defined(__BRANCHED_PATH__) || defined(__SUBSURFACE__) - -ccl_device void kernel_branched_path_integrate_direct_lighting(KernelGlobals *kg, RNG *rng, - ShaderData *sd, PathState *state, float3 throughput, float num_samples_adjust, PathRadiance *L, bool sample_all_lights) -{ - /* sample illumination from lights to find path contribution */ - if(sd->flag & SD_BSDF_HAS_EVAL) { 
- Ray light_ray; - BsdfEval L_light; - bool is_lamp; - -#ifdef __OBJECT_MOTION__ - light_ray.time = sd->time; -#endif - - if(sample_all_lights) { - /* lamp sampling */ - for(int i = 0; i < kernel_data.integrator.num_all_lights; i++) { - int num_samples = ceil_to_int(num_samples_adjust*light_select_num_samples(kg, i)); - float num_samples_inv = num_samples_adjust/(num_samples*kernel_data.integrator.num_all_lights); - RNG lamp_rng = cmj_hash(*rng, i); - - if(kernel_data.integrator.pdf_triangles != 0.0f) - num_samples_inv *= 0.5f; - - for(int j = 0; j < num_samples; j++) { - float light_u, light_v; - path_branched_rng_2D(kg, &lamp_rng, state, j, num_samples, PRNG_LIGHT_U, &light_u, &light_v); - - if(direct_emission(kg, sd, i, 0.0f, light_u, light_v, &light_ray, &L_light, &is_lamp, state->bounce, state->transparent_bounce)) { - /* trace shadow ray */ - float3 shadow; - - if(!shadow_blocked(kg, state, &light_ray, &shadow)) { - /* accumulate */ - path_radiance_accum_light(L, throughput*num_samples_inv, &L_light, shadow, num_samples_inv, state->bounce, is_lamp); - } - } - } - } - - /* mesh light sampling */ - if(kernel_data.integrator.pdf_triangles != 0.0f) { - int num_samples = ceil_to_int(num_samples_adjust*kernel_data.integrator.mesh_light_samples); - float num_samples_inv = num_samples_adjust/num_samples; - - if(kernel_data.integrator.num_all_lights) - num_samples_inv *= 0.5f; - - for(int j = 0; j < num_samples; j++) { - float light_t = path_branched_rng_1D(kg, rng, state, j, num_samples, PRNG_LIGHT); - float light_u, light_v; - path_branched_rng_2D(kg, rng, state, j, num_samples, PRNG_LIGHT_U, &light_u, &light_v); - - /* only sample triangle lights */ - if(kernel_data.integrator.num_all_lights) - light_t = 0.5f*light_t; - - if(direct_emission(kg, sd, LAMP_NONE, light_t, light_u, light_v, &light_ray, &L_light, &is_lamp, state->bounce, state->transparent_bounce)) { - /* trace shadow ray */ - float3 shadow; - - if(!shadow_blocked(kg, state, &light_ray, &shadow)) { - /* 
accumulate */ - path_radiance_accum_light(L, throughput*num_samples_inv, &L_light, shadow, num_samples_inv, state->bounce, is_lamp); - } - } - } - } - } - else { - float light_t = path_state_rng_1D(kg, rng, state, PRNG_LIGHT); - float light_u, light_v; - path_state_rng_2D(kg, rng, state, PRNG_LIGHT_U, &light_u, &light_v); - - /* sample random light */ - if(direct_emission(kg, sd, LAMP_NONE, light_t, light_u, light_v, &light_ray, &L_light, &is_lamp, state->bounce, state->transparent_bounce)) { - /* trace shadow ray */ - float3 shadow; - - if(!shadow_blocked(kg, state, &light_ray, &shadow)) { - /* accumulate */ - path_radiance_accum_light(L, throughput, &L_light, shadow, 1.0f, state->bounce, is_lamp); - } - } - } - } -} - -#endif +CCL_NAMESPACE_BEGIN ccl_device void kernel_path_indirect(KernelGlobals *kg, RNG *rng, Ray ray, float3 throughput, int num_samples, PathState state, PathRadiance *L) @@ -222,11 +59,7 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg, RNG *rng, Ray ray, /* intersect scene */ Intersection isect; uint visibility = path_state_ray_visibility(kg, &state); -#ifdef __HAIR__ bool hit = scene_intersect(kg, &ray, visibility, &isect, NULL, 0.0f, 0.0f); -#else - bool hit = scene_intersect(kg, &ray, visibility, &isect); -#endif #ifdef __LAMP_MIS__ if(kernel_data.integrator.use_lamp_mis && !(state.flag & PATH_RAY_CAMERA)) { @@ -255,15 +88,81 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg, RNG *rng, Ray ray, Ray volume_ray = ray; volume_ray.t = (hit)? 
isect.t: FLT_MAX; - ShaderData volume_sd; - VolumeIntegrateResult result = kernel_volume_integrate(kg, &state, - &volume_sd, &volume_ray, L, &throughput, rng); + bool heterogeneous = volume_stack_is_heterogeneous(kg, state.volume_stack); - if(result == VOLUME_PATH_SCATTERED) { - if(kernel_path_integrate_scatter_lighting(kg, rng, &volume_sd, &throughput, &state, L, &ray, 1.0f)) - continue; - else - break; +#ifdef __VOLUME_DECOUPLED__ + int sampling_method = volume_stack_sampling_method(kg, state.volume_stack); + bool decoupled = kernel_volume_use_decoupled(kg, heterogeneous, false, sampling_method); + + if(decoupled) { + /* cache steps along volume for repeated sampling */ + VolumeSegment volume_segment; + ShaderData volume_sd; + + shader_setup_from_volume(kg, &volume_sd, &volume_ray, state.bounce, state.transparent_bounce); + kernel_volume_decoupled_record(kg, &state, + &volume_ray, &volume_sd, &volume_segment, heterogeneous); + + volume_segment.sampling_method = sampling_method; + + /* emission */ + if(volume_segment.closure_flag & SD_EMISSION) + path_radiance_accum_emission(L, throughput, volume_segment.accum_emission, state.bounce); + + /* scattering */ + VolumeIntegrateResult result = VOLUME_PATH_ATTENUATED; + + if(volume_segment.closure_flag & SD_SCATTER) { + bool all = kernel_data.integrator.sample_all_lights_indirect; + + /* direct light sampling */ + kernel_branched_path_volume_connect_light(kg, rng, &volume_sd, + throughput, &state, L, 1.0f, all, &volume_ray, &volume_segment); + + /* indirect sample. 
if we use distance sampling and take just + * one sample for direct and indirect light, we could share + * this computation, but makes code a bit complex */ + float rphase = path_state_rng_1D_for_decision(kg, rng, &state, PRNG_PHASE); + float rscatter = path_state_rng_1D_for_decision(kg, rng, &state, PRNG_SCATTER_DISTANCE); + + result = kernel_volume_decoupled_scatter(kg, + &state, &volume_ray, &volume_sd, &throughput, + rphase, rscatter, &volume_segment, NULL, true); + } + + if(result != VOLUME_PATH_SCATTERED) + throughput *= volume_segment.accum_transmittance; + + /* free cached steps */ + kernel_volume_decoupled_free(kg, &volume_segment); + + if(result == VOLUME_PATH_SCATTERED) { + if(kernel_path_volume_bounce(kg, rng, &volume_sd, &throughput, &state, L, &ray)) + continue; + else + break; + } + } + else +#endif + { + /* integrate along volume segment with distance sampling */ + ShaderData volume_sd; + VolumeIntegrateResult result = kernel_volume_integrate( + kg, &state, &volume_sd, &volume_ray, L, &throughput, rng, heterogeneous); + +#ifdef __VOLUME_SCATTER__ + if(result == VOLUME_PATH_SCATTERED) { + /* direct lighting */ + kernel_path_volume_connect_light(kg, rng, &volume_sd, throughput, &state, L); + + /* indirect light bounce */ + if(kernel_path_volume_bounce(kg, rng, &volume_sd, &throughput, &state, L, &ray)) + continue; + else + break; + } +#endif } } #endif @@ -281,7 +180,7 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg, RNG *rng, Ray ray, /* setup shading */ ShaderData sd; shader_setup_from_ray(kg, &sd, &isect, &ray, state.bounce, state.transparent_bounce); - float rbsdf = path_state_rng_1D(kg, rng, &state, PRNG_BSDF); + float rbsdf = path_state_rng_1D_for_decision(kg, rng, &state, PRNG_BSDF); shader_eval_surface(kg, &sd, rbsdf, state.flag, SHADER_CONTEXT_INDIRECT); #ifdef __BRANCHED_PATH__ shader_merge_closures(&sd); @@ -315,7 +214,7 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg, RNG *rng, Ray ray, break; } else if(probability != 
1.0f) { - float terminate = path_state_rng_1D(kg, rng, &state, PRNG_TERMINATE); + float terminate = path_state_rng_1D_for_decision(kg, rng, &state, PRNG_TERMINATE); if(terminate >= probability) break; @@ -383,187 +282,12 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg, RNG *rng, Ray ray, #if defined(__EMISSION__) && defined(__BRANCHED_PATH__) if(kernel_data.integrator.use_direct_light) { bool all = kernel_data.integrator.sample_all_lights_indirect; - kernel_branched_path_integrate_direct_lighting(kg, rng, &sd, &state, throughput, 1.0f, L, all); - } -#endif - - /* no BSDF? we can stop here */ - if(sd.flag & SD_BSDF) { - /* sample BSDF */ - float bsdf_pdf; - BsdfEval bsdf_eval; - float3 bsdf_omega_in; - differential3 bsdf_domega_in; - float bsdf_u, bsdf_v; - path_state_rng_2D(kg, rng, &state, PRNG_BSDF_U, &bsdf_u, &bsdf_v); - int label; - - label = shader_bsdf_sample(kg, &sd, bsdf_u, bsdf_v, &bsdf_eval, - &bsdf_omega_in, &bsdf_domega_in, &bsdf_pdf); - - if(bsdf_pdf == 0.0f || bsdf_eval_is_zero(&bsdf_eval)) - break; - - /* modify throughput */ - path_radiance_bsdf_bounce(L, &throughput, &bsdf_eval, bsdf_pdf, state.bounce, label); - - /* set labels */ - if(!(label & LABEL_TRANSPARENT)) { - state.ray_pdf = bsdf_pdf; -#ifdef __LAMP_MIS__ - state.ray_t = 0.0f; -#endif - state.min_ray_pdf = fminf(bsdf_pdf, state.min_ray_pdf); - } - - /* update path state */ - path_state_next(kg, &state, label); - - /* setup ray */ - ray.P = ray_offset(sd.P, (label & LABEL_TRANSMIT)? -sd.Ng: sd.Ng); - ray.D = bsdf_omega_in; - ray.t = FLT_MAX; -#ifdef __RAY_DIFFERENTIALS__ - ray.dP = sd.dP; - ray.dD = bsdf_domega_in; -#endif - -#ifdef __VOLUME__ - /* enter/exit volume */ - if(label & LABEL_TRANSMIT) - kernel_volume_stack_enter_exit(kg, &sd, state.volume_stack); -#endif + kernel_branched_path_surface_connect_light(kg, rng, &sd, &state, throughput, 1.0f, L, all); } -#ifdef __VOLUME__ - else if(sd.flag & SD_HAS_ONLY_VOLUME) { - /* no surface shader but have a volume shader? 
act transparent */ - - /* update path state, count as transparent */ - path_state_next(kg, &state, LABEL_TRANSPARENT); - - /* setup ray position, direction stays unchanged */ - ray.P = ray_offset(sd.P, -sd.Ng); -#ifdef __RAY_DIFFERENTIALS__ - ray.dP = sd.dP; #endif - /* enter/exit volume */ - kernel_volume_stack_enter_exit(kg, &sd, state.volume_stack); - } -#endif - else { - /* no bsdf or volume? we're done */ + if(!kernel_path_surface_bounce(kg, rng, &sd, &throughput, &state, L, &ray)) break; - } - } -} - -ccl_device_inline bool kernel_path_integrate_lighting(KernelGlobals *kg, RNG *rng, - ShaderData *sd, float3 *throughput, PathState *state, PathRadiance *L, Ray *ray) -{ -#ifdef __EMISSION__ - if(kernel_data.integrator.use_direct_light) { - /* sample illumination from lights to find path contribution */ - if(sd->flag & SD_BSDF_HAS_EVAL) { - float light_t = path_state_rng_1D(kg, rng, state, PRNG_LIGHT); - float light_u, light_v; - path_state_rng_2D(kg, rng, state, PRNG_LIGHT_U, &light_u, &light_v); - - Ray light_ray; - BsdfEval L_light; - bool is_lamp; - -#ifdef __OBJECT_MOTION__ - light_ray.time = sd->time; -#endif - - if(direct_emission(kg, sd, LAMP_NONE, light_t, light_u, light_v, &light_ray, &L_light, &is_lamp, state->bounce, state->transparent_bounce)) { - /* trace shadow ray */ - float3 shadow; - - if(!shadow_blocked(kg, state, &light_ray, &shadow)) { - /* accumulate */ - path_radiance_accum_light(L, *throughput, &L_light, shadow, 1.0f, state->bounce, is_lamp); - } - } - } - } -#endif - - /* no BSDF? 
we can stop here */ - if(sd->flag & SD_BSDF) { - /* sample BSDF */ - float bsdf_pdf; - BsdfEval bsdf_eval; - float3 bsdf_omega_in; - differential3 bsdf_domega_in; - float bsdf_u, bsdf_v; - path_state_rng_2D(kg, rng, state, PRNG_BSDF_U, &bsdf_u, &bsdf_v); - int label; - - label = shader_bsdf_sample(kg, sd, bsdf_u, bsdf_v, &bsdf_eval, - &bsdf_omega_in, &bsdf_domega_in, &bsdf_pdf); - - if(bsdf_pdf == 0.0f || bsdf_eval_is_zero(&bsdf_eval)) - return false; - - /* modify throughput */ - path_radiance_bsdf_bounce(L, throughput, &bsdf_eval, bsdf_pdf, state->bounce, label); - - /* set labels */ - if(!(label & LABEL_TRANSPARENT)) { - state->ray_pdf = bsdf_pdf; -#ifdef __LAMP_MIS__ - state->ray_t = 0.0f; -#endif - state->min_ray_pdf = fminf(bsdf_pdf, state->min_ray_pdf); - } - - /* update path state */ - path_state_next(kg, state, label); - - /* setup ray */ - ray->P = ray_offset(sd->P, (label & LABEL_TRANSMIT)? -sd->Ng: sd->Ng); - ray->D = bsdf_omega_in; - - if(state->bounce == 0) - ray->t -= sd->ray_length; /* clipping works through transparent */ - else - ray->t = FLT_MAX; - -#ifdef __RAY_DIFFERENTIALS__ - ray->dP = sd->dP; - ray->dD = bsdf_domega_in; -#endif - -#ifdef __VOLUME__ - /* enter/exit volume */ - if(label & LABEL_TRANSMIT) - kernel_volume_stack_enter_exit(kg, sd, state->volume_stack); -#endif - return true; - } -#ifdef __VOLUME__ - else if(sd->flag & SD_HAS_ONLY_VOLUME) { - /* no surface shader but have a volume shader? act transparent */ - - /* update path state, count as transparent */ - path_state_next(kg, state, LABEL_TRANSPARENT); - - /* setup ray position, direction stays unchanged */ - ray->P = ray_offset(sd->P, -sd->Ng); -#ifdef __RAY_DIFFERENTIALS__ - ray->dP = sd->dP; -#endif - - /* enter/exit volume */ - kernel_volume_stack_enter_exit(kg, sd, state->volume_stack); - return true; - } -#endif - else { - /* no bsdf or volume? 
*/ - return false; } } @@ -601,7 +325,68 @@ ccl_device void kernel_path_ao(KernelGlobals *kg, ShaderData *sd, PathRadiance * } } +ccl_device void kernel_branched_path_ao(KernelGlobals *kg, ShaderData *sd, PathRadiance *L, PathState *state, RNG *rng, float3 throughput) +{ + int num_samples = kernel_data.integrator.ao_samples; + float num_samples_inv = 1.0f/num_samples; + float ao_factor = kernel_data.background.ao_factor; + float3 ao_N; + float3 ao_bsdf = shader_bsdf_ao(kg, sd, ao_factor, &ao_N); + float3 ao_alpha = shader_bsdf_alpha(kg, sd); + + for(int j = 0; j < num_samples; j++) { + float bsdf_u, bsdf_v; + path_branched_rng_2D(kg, rng, state, j, num_samples, PRNG_BSDF_U, &bsdf_u, &bsdf_v); + + float3 ao_D; + float ao_pdf; + + sample_cos_hemisphere(ao_N, bsdf_u, bsdf_v, &ao_D, &ao_pdf); + + if(dot(sd->Ng, ao_D) > 0.0f && ao_pdf != 0.0f) { + Ray light_ray; + float3 ao_shadow; + + light_ray.P = ray_offset(sd->P, sd->Ng); + light_ray.D = ao_D; + light_ray.t = kernel_data.background.ao_distance; +#ifdef __OBJECT_MOTION__ + light_ray.time = sd->time; +#endif + light_ray.dP = sd->dP; + light_ray.dD = differential3_zero(); + + if(!shadow_blocked(kg, state, &light_ray, &ao_shadow)) + path_radiance_accum_ao(L, throughput*num_samples_inv, ao_alpha, ao_bsdf, ao_shadow, state->bounce); + } + } +} + #ifdef __SUBSURFACE__ + +#ifdef __VOLUME__ +ccl_device void kernel_path_subsurface_update_volume_stack(KernelGlobals *kg, + Ray *ray, + VolumeStack *stack) +{ + kernel_assert(kernel_data.integrator.use_volumes); + + Ray volume_ray = *ray; + Intersection isect; + + while(scene_intersect_volume(kg, &volume_ray, &isect)) + { + ShaderData sd; + shader_setup_from_ray(kg, &sd, &isect, &volume_ray, 0, 0); + kernel_volume_stack_enter_exit(kg, &sd, stack); + + /* Move ray forward. 
*/ + volume_ray.P = ray_offset(sd.P, -sd.Ng); + volume_ray.t -= sd.ray_length; + } +} +#endif + ccl_device bool kernel_path_subsurface_scatter(KernelGlobals *kg, ShaderData *sd, PathRadiance *L, PathState *state, RNG *rng, Ray *ray, float3 *throughput) { float bssrdf_probability; @@ -618,6 +403,11 @@ ccl_device bool kernel_path_subsurface_scatter(KernelGlobals *kg, ShaderData *sd float bssrdf_u, bssrdf_v; path_state_rng_2D(kg, rng, state, PRNG_BSDF_U, &bssrdf_u, &bssrdf_v); int num_hits = subsurface_scatter_multi_step(kg, sd, bssrdf_sd, state->flag, sc, &lcg_state, bssrdf_u, bssrdf_v, false); +#ifdef __VOLUME__ + Ray volume_ray = *ray; + bool need_update_volume_stack = kernel_data.integrator.use_volumes && + sd->flag & SD_OBJECT_INTERSECTS_VOLUME; +#endif /* compute lighting with the BSDF closure */ for(int hit = 0; hit < num_hits; hit++) { @@ -627,12 +417,30 @@ ccl_device bool kernel_path_subsurface_scatter(KernelGlobals *kg, ShaderData *sd hit_state.flag |= PATH_RAY_BSSRDF_ANCESTOR; hit_state.rng_offset += PRNG_BOUNCE_NUM; + + kernel_path_surface_connect_light(kg, rng, &bssrdf_sd[hit], tp, state, L); - if(kernel_path_integrate_lighting(kg, rng, &bssrdf_sd[hit], &tp, &hit_state, L, &hit_ray)) { + if(kernel_path_surface_bounce(kg, rng, &bssrdf_sd[hit], &tp, &hit_state, L, &hit_ray)) { #ifdef __LAMP_MIS__ hit_state.ray_t = 0.0f; #endif +#ifdef __VOLUME__ + if(need_update_volume_stack) { + /* Setup ray from previous surface point to the new one. */ + volume_ray.D = normalize_len(hit_ray.P - volume_ray.P, + &volume_ray.t); + + kernel_path_subsurface_update_volume_stack( + kg, + &volume_ray, + hit_state.volume_stack); + + /* Move volume ray forward. 
*/ + volume_ray.P = hit_ray.P; + } +#endif + kernel_path_indirect(kg, rng, hit_ray, tp, state->num_samples, hit_state, L); /* for render passes, sum and reset indirect light pass variables @@ -657,7 +465,12 @@ ccl_device float4 kernel_path_integrate(KernelGlobals *kg, RNG *rng, int sample, path_radiance_init(&L, kernel_data.film.use_light_pass); PathState state; - path_state_init(kg, &state, rng, sample); + path_state_init(kg, &state, rng, sample, &ray); + +#ifdef __KERNEL_DEBUG__ + DebugData debug_data; + debug_data_init(&debug_data); +#endif /* path iteration */ for(;;) { @@ -682,7 +495,13 @@ ccl_device float4 kernel_path_integrate(KernelGlobals *kg, RNG *rng, int sample, bool hit = scene_intersect(kg, &ray, visibility, &isect, &lcg_state, difl, extmax); #else - bool hit = scene_intersect(kg, &ray, visibility, &isect); + bool hit = scene_intersect(kg, &ray, visibility, &isect, NULL, 0.0f, 0.0f); +#endif + +#ifdef __KERNEL_DEBUG__ + if(state.flag & PATH_RAY_CAMERA) { + debug_data.num_bvh_traversal_steps += isect.num_traversal_steps; + } #endif #ifdef __LAMP_MIS__ @@ -712,15 +531,81 @@ ccl_device float4 kernel_path_integrate(KernelGlobals *kg, RNG *rng, int sample, Ray volume_ray = ray; volume_ray.t = (hit)? 
isect.t: FLT_MAX; - ShaderData volume_sd; - VolumeIntegrateResult result = kernel_volume_integrate(kg, &state, - &volume_sd, &volume_ray, &L, &throughput, rng); + bool heterogeneous = volume_stack_is_heterogeneous(kg, state.volume_stack); - if(result == VOLUME_PATH_SCATTERED) { - if(kernel_path_integrate_scatter_lighting(kg, rng, &volume_sd, &throughput, &state, &L, &ray, 1.0f)) - continue; - else - break; +#ifdef __VOLUME_DECOUPLED__ + int sampling_method = volume_stack_sampling_method(kg, state.volume_stack); + bool decoupled = kernel_volume_use_decoupled(kg, heterogeneous, true, sampling_method); + + if(decoupled) { + /* cache steps along volume for repeated sampling */ + VolumeSegment volume_segment; + ShaderData volume_sd; + + shader_setup_from_volume(kg, &volume_sd, &volume_ray, state.bounce, state.transparent_bounce); + kernel_volume_decoupled_record(kg, &state, + &volume_ray, &volume_sd, &volume_segment, heterogeneous); + + volume_segment.sampling_method = sampling_method; + + /* emission */ + if(volume_segment.closure_flag & SD_EMISSION) + path_radiance_accum_emission(&L, throughput, volume_segment.accum_emission, state.bounce); + + /* scattering */ + VolumeIntegrateResult result = VOLUME_PATH_ATTENUATED; + + if(volume_segment.closure_flag & SD_SCATTER) { + bool all = false; + + /* direct light sampling */ + kernel_branched_path_volume_connect_light(kg, rng, &volume_sd, + throughput, &state, &L, 1.0f, all, &volume_ray, &volume_segment); + + /* indirect sample. 
if we use distance sampling and take just + * one sample for direct and indirect light, we could share + * this computation, but makes code a bit complex */ + float rphase = path_state_rng_1D_for_decision(kg, rng, &state, PRNG_PHASE); + float rscatter = path_state_rng_1D_for_decision(kg, rng, &state, PRNG_SCATTER_DISTANCE); + + result = kernel_volume_decoupled_scatter(kg, + &state, &volume_ray, &volume_sd, &throughput, + rphase, rscatter, &volume_segment, NULL, true); + } + + if(result != VOLUME_PATH_SCATTERED) + throughput *= volume_segment.accum_transmittance; + + /* free cached steps */ + kernel_volume_decoupled_free(kg, &volume_segment); + + if(result == VOLUME_PATH_SCATTERED) { + if(kernel_path_volume_bounce(kg, rng, &volume_sd, &throughput, &state, &L, &ray)) + continue; + else + break; + } + } + else +#endif + { + /* integrate along volume segment with distance sampling */ + ShaderData volume_sd; + VolumeIntegrateResult result = kernel_volume_integrate( + kg, &state, &volume_sd, &volume_ray, &L, &throughput, rng, heterogeneous); + +#ifdef __VOLUME_SCATTER__ + if(result == VOLUME_PATH_SCATTERED) { + /* direct lighting */ + kernel_path_volume_connect_light(kg, rng, &volume_sd, throughput, &state, &L); + + /* indirect light bounce */ + if(kernel_path_volume_bounce(kg, rng, &volume_sd, &throughput, &state, &L, &ray)) + continue; + else + break; + } +#endif } } #endif @@ -748,7 +633,7 @@ ccl_device float4 kernel_path_integrate(KernelGlobals *kg, RNG *rng, int sample, /* setup shading */ ShaderData sd; shader_setup_from_ray(kg, &sd, &isect, &ray, state.bounce, state.transparent_bounce); - float rbsdf = path_state_rng_1D(kg, rng, &state, PRNG_BSDF); + float rbsdf = path_state_rng_1D_for_decision(kg, rng, &state, PRNG_BSDF); shader_eval_surface(kg, &sd, rbsdf, state.flag, SHADER_CONTEXT_MAIN); /* holdout */ @@ -803,7 +688,7 @@ ccl_device float4 kernel_path_integrate(KernelGlobals *kg, RNG *rng, int sample, break; } else if(probability != 1.0f) { - float terminate = 
path_state_rng_1D(kg, rng, &state, PRNG_TERMINATE); + float terminate = path_state_rng_1D_for_decision(kg, rng, &state, PRNG_TERMINATE); if(terminate >= probability) break; @@ -826,134 +711,33 @@ ccl_device float4 kernel_path_integrate(KernelGlobals *kg, RNG *rng, int sample, break; } #endif - - /* Same as kernel_path_integrate_lighting(kg, rng, &sd, &throughput, &state, &L, &ray), - but for CUDA the function call is slower. */ -#ifdef __EMISSION__ - if(kernel_data.integrator.use_direct_light) { - /* sample illumination from lights to find path contribution */ - if(sd.flag & SD_BSDF_HAS_EVAL) { - float light_t = path_state_rng_1D(kg, rng, &state, PRNG_LIGHT); - float light_u, light_v; - path_state_rng_2D(kg, rng, &state, PRNG_LIGHT_U, &light_u, &light_v); - - Ray light_ray; - BsdfEval L_light; - bool is_lamp; - -#ifdef __OBJECT_MOTION__ - light_ray.time = sd.time; -#endif - - if(direct_emission(kg, &sd, LAMP_NONE, light_t, light_u, light_v, &light_ray, &L_light, &is_lamp, state.bounce, state.transparent_bounce)) { - /* trace shadow ray */ - float3 shadow; - - if(!shadow_blocked(kg, &state, &light_ray, &shadow)) { - /* accumulate */ - path_radiance_accum_light(&L, throughput, &L_light, shadow, 1.0f, state.bounce, is_lamp); - } - } - } - } -#endif - - if(sd.flag & SD_BSDF) { - /* sample BSDF */ - float bsdf_pdf; - BsdfEval bsdf_eval; - float3 bsdf_omega_in; - differential3 bsdf_domega_in; - float bsdf_u, bsdf_v; - path_state_rng_2D(kg, rng, &state, PRNG_BSDF_U, &bsdf_u, &bsdf_v); - int label; - - label = shader_bsdf_sample(kg, &sd, bsdf_u, bsdf_v, &bsdf_eval, - &bsdf_omega_in, &bsdf_domega_in, &bsdf_pdf); - - if(bsdf_pdf == 0.0f || bsdf_eval_is_zero(&bsdf_eval)) - break; - - /* modify throughput */ - path_radiance_bsdf_bounce(&L, &throughput, &bsdf_eval, bsdf_pdf, state.bounce, label); - - /* set labels */ - if(!(label & LABEL_TRANSPARENT)) { - state.ray_pdf = bsdf_pdf; -#ifdef __LAMP_MIS__ - state.ray_t = 0.0f; -#endif - state.min_ray_pdf = fminf(bsdf_pdf, 
state.min_ray_pdf); - } - - /* update path state */ - path_state_next(kg, &state, label); - - /* setup ray */ - ray.P = ray_offset(sd.P, (label & LABEL_TRANSMIT)? -sd.Ng: sd.Ng); - ray.D = bsdf_omega_in; - -#ifdef __RAY_DIFFERENTIALS__ - ray.dP = sd.dP; - ray.dD = bsdf_domega_in; -#endif - -#ifdef __VOLUME__ - /* enter/exit volume */ - if(label & LABEL_TRANSMIT) - kernel_volume_stack_enter_exit(kg, &sd, state.volume_stack); -#endif - } -#ifdef __VOLUME__ - else if(sd.flag & SD_HAS_ONLY_VOLUME) { - /* no surface shader but have a volume shader? act transparent */ + /* direct lighting */ + kernel_path_surface_connect_light(kg, rng, &sd, throughput, &state, &L); - /* update path state, count as transparent */ - path_state_next(kg, &state, LABEL_TRANSPARENT); - - /* setup ray position, direction stays unchanged */ - ray.P = ray_offset(sd.P, -sd.Ng); -#ifdef __RAY_DIFFERENTIALS__ - ray.dP = sd.dP; -#endif - - /* enter/exit volume */ - kernel_volume_stack_enter_exit(kg, &sd, state.volume_stack); - } -#endif - else { - /* no bsdf or volume? 
we're done */ + /* compute direct lighting and next bounce */ + if(!kernel_path_surface_bounce(kg, rng, &sd, &throughput, &state, &L, &ray)) break; - } - - /* adjust ray distance for clipping */ - if(state.bounce == 0) - ray.t -= sd.ray_length; /* clipping works through transparent */ - else - ray.t = FLT_MAX; } float3 L_sum = path_radiance_clamp_and_sum(kg, &L); kernel_write_light_passes(kg, buffer, &L, sample); +#ifdef __KERNEL_DEBUG__ + kernel_write_debug_passes(kg, buffer, &state, &debug_data, sample); +#endif + return make_float4(L_sum.x, L_sum.y, L_sum.z, 1.0f - L_transparent); } #ifdef __BRANCHED_PATH__ -ccl_device_noinline void kernel_branched_path_integrate_lighting(KernelGlobals *kg, +/* branched path tracing: bounce off surface and integrate indirect light */ +ccl_device_noinline void kernel_branched_path_surface_indirect_light(KernelGlobals *kg, RNG *rng, ShaderData *sd, float3 throughput, float num_samples_adjust, - PathState *state, PathRadiance *L, ccl_global float *buffer) + PathState *state, PathRadiance *L) { -#ifdef __EMISSION__ - if(kernel_data.integrator.use_direct_light) { - bool all = kernel_data.integrator.sample_all_lights_direct; - kernel_branched_path_integrate_direct_lighting(kg, rng, sd, state, throughput, num_samples_adjust, L, all); - } -#endif - for(int i = 0; i< sd->num_closure; i++) { const ShaderClosure *sc = &sd->closure[i]; @@ -980,68 +764,102 @@ ccl_device_noinline void kernel_branched_path_integrate_lighting(KernelGlobals * RNG bsdf_rng = cmj_hash(*rng, i); for(int j = 0; j < num_samples; j++) { - /* sample BSDF */ - float bsdf_pdf; - BsdfEval bsdf_eval; - float3 bsdf_omega_in; - differential3 bsdf_domega_in; - float bsdf_u, bsdf_v; - path_branched_rng_2D(kg, &bsdf_rng, state, j, num_samples, PRNG_BSDF_U, &bsdf_u, &bsdf_v); - int label; - - label = shader_bsdf_sample_closure(kg, sd, sc, bsdf_u, bsdf_v, &bsdf_eval, - &bsdf_omega_in, &bsdf_domega_in, &bsdf_pdf); + PathState ps = *state; + float3 tp = throughput; + Ray bsdf_ray; 
- if(bsdf_pdf == 0.0f || bsdf_eval_is_zero(&bsdf_eval)) + if(!kernel_branched_path_surface_bounce(kg, &bsdf_rng, sd, sc, j, num_samples, &tp, &ps, L, &bsdf_ray)) continue; - /* modify throughput */ - float3 tp = throughput; - path_radiance_bsdf_bounce(L, &tp, &bsdf_eval, bsdf_pdf, state->bounce, label); + kernel_path_indirect(kg, rng, bsdf_ray, tp*num_samples_inv, num_samples, ps, L); - /* modify path state */ - PathState ps = *state; - path_state_next(kg, &ps, label); + /* for render passes, sum and reset indirect light pass variables + * for the next samples */ + path_radiance_sum_indirect(L); + path_radiance_reset_indirect(L); + } + } +} - /* setup ray */ - Ray bsdf_ray; +#ifdef __SUBSURFACE__ +ccl_device void kernel_branched_path_subsurface_scatter(KernelGlobals *kg, + ShaderData *sd, + PathRadiance *L, + PathState *state, + RNG *rng, + Ray *ray, + float3 throughput) +{ + for(int i = 0; i< sd->num_closure; i++) { + ShaderClosure *sc = &sd->closure[i]; - bsdf_ray.P = ray_offset(sd->P, (label & LABEL_TRANSMIT)? 
-sd->Ng: sd->Ng); - bsdf_ray.D = bsdf_omega_in; - bsdf_ray.t = FLT_MAX; -#ifdef __RAY_DIFFERENTIALS__ - bsdf_ray.dP = sd->dP; - bsdf_ray.dD = bsdf_domega_in; -#endif -#ifdef __OBJECT_MOTION__ - bsdf_ray.time = sd->time; -#endif + if(!CLOSURE_IS_BSSRDF(sc->type)) + continue; + + /* set up random number generator */ + uint lcg_state = lcg_state_init(rng, state, 0x68bc21eb); + int num_samples = kernel_data.integrator.subsurface_samples; + float num_samples_inv = 1.0f/num_samples; + RNG bssrdf_rng = cmj_hash(*rng, i); + + state->flag |= PATH_RAY_BSSRDF_ANCESTOR; + /* do subsurface scatter step with copy of shader data, this will + * replace the BSSRDF with a diffuse BSDF closure */ + for(int j = 0; j < num_samples; j++) { + ShaderData bssrdf_sd[BSSRDF_MAX_HITS]; + float bssrdf_u, bssrdf_v; + path_branched_rng_2D(kg, &bssrdf_rng, state, j, num_samples, PRNG_BSDF_U, &bssrdf_u, &bssrdf_v); + int num_hits = subsurface_scatter_multi_step(kg, sd, bssrdf_sd, state->flag, sc, &lcg_state, bssrdf_u, bssrdf_v, true); #ifdef __VOLUME__ - /* enter/exit volume */ - if(label & LABEL_TRANSMIT) - kernel_volume_stack_enter_exit(kg, sd, ps.volume_stack); + Ray volume_ray = *ray; + bool need_update_volume_stack = kernel_data.integrator.use_volumes && + sd->flag & SD_OBJECT_INTERSECTS_VOLUME; #endif - /* branch RNG state */ - path_state_branch(&ps, j, num_samples); + /* compute lighting with the BSDF closure */ + for(int hit = 0; hit < num_hits; hit++) { + PathState hit_state = *state; - /* set MIS state */ - ps.min_ray_pdf = fminf(bsdf_pdf, FLT_MAX); - ps.ray_pdf = bsdf_pdf; -#ifdef __LAMP_MIS__ - ps.ray_t = 0.0f; + path_state_branch(&hit_state, j, num_samples); + +#ifdef __VOLUME__ + if(need_update_volume_stack) { + /* Setup ray from previous surface point to the new one. 
*/ + float3 P = ray_offset(bssrdf_sd[hit].P, -bssrdf_sd[hit].Ng); + volume_ray.D = normalize_len(P - volume_ray.P, + &volume_ray.t); + + kernel_path_subsurface_update_volume_stack( + kg, + &volume_ray, + hit_state.volume_stack); + + /* Move volume ray forward. */ + volume_ray.P = P; + } #endif - kernel_path_indirect(kg, rng, bsdf_ray, tp*num_samples_inv, num_samples, ps, L); +#if defined(__EMISSION__) && defined(__BRANCHED_PATH__) + /* direct light */ + if(kernel_data.integrator.use_direct_light) { + bool all = kernel_data.integrator.sample_all_lights_direct; + kernel_branched_path_surface_connect_light(kg, rng, + &bssrdf_sd[hit], &hit_state, throughput, num_samples_inv, L, all); + } +#endif - /* for render passes, sum and reset indirect light pass variables - * for the next samples */ - path_radiance_sum_indirect(L); - path_radiance_reset_indirect(L); + /* indirect light */ + kernel_branched_path_surface_indirect_light(kg, rng, + &bssrdf_sd[hit], throughput, num_samples_inv, + &hit_state, L); + } } + + state->flag &= ~PATH_RAY_BSSRDF_ANCESTOR; } } +#endif ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, int sample, Ray ray, ccl_global float *buffer) { @@ -1053,7 +871,12 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in path_radiance_init(&L, kernel_data.film.use_light_pass); PathState state; - path_state_init(kg, &state, rng, sample); + path_state_init(kg, &state, rng, sample, &ray); + +#ifdef __KERNEL_DEBUG__ + DebugData debug_data; + debug_data_init(&debug_data); +#endif for(;;) { /* intersect scene */ @@ -1077,7 +900,13 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in bool hit = scene_intersect(kg, &ray, visibility, &isect, &lcg_state, difl, extmax); #else - bool hit = scene_intersect(kg, &ray, visibility, &isect); + bool hit = scene_intersect(kg, &ray, visibility, &isect, NULL, 0.0f, 0.0f); +#endif + +#ifdef __KERNEL_DEBUG__ + if(state.flag & PATH_RAY_CAMERA) { + 
debug_data.num_bvh_traversal_steps += isect.num_traversal_steps; + } #endif #ifdef __VOLUME__ @@ -1085,10 +914,11 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in if(state.volume_stack[0].shader != SHADER_NONE) { Ray volume_ray = ray; volume_ray.t = (hit)? isect.t: FLT_MAX; + + bool heterogeneous = volume_stack_is_heterogeneous(kg, state.volume_stack); -#ifdef __KERNEL_CPU__ +#ifdef __VOLUME_DECOUPLED__ /* decoupled ray marching only supported on CPU */ - bool heterogeneous = volume_stack_is_heterogeneous(kg, state.volume_stack); /* cache steps along volume for repeated sampling */ VolumeSegment volume_segment; @@ -1098,29 +928,45 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in kernel_volume_decoupled_record(kg, &state, &volume_ray, &volume_sd, &volume_segment, heterogeneous); - /* sample scattering */ - int num_samples = kernel_data.integrator.volume_samples; - float num_samples_inv = 1.0f/num_samples; + /* direct light sampling */ + if(volume_segment.closure_flag & SD_SCATTER) { + volume_segment.sampling_method = volume_stack_sampling_method(kg, state.volume_stack); - for(int j = 0; j < num_samples; j++) { - /* workaround to fix correlation bug in T38710, can find better solution - * in random number generator later, for now this is done here to not impact - * performance of rendering without volumes */ - RNG tmp_rng = cmj_hash(*rng, state.rng_offset); + bool all = kernel_data.integrator.sample_all_lights_direct; - PathState ps = state; - Ray pray = ray; - float3 tp = throughput; + kernel_branched_path_volume_connect_light(kg, rng, &volume_sd, + throughput, &state, &L, 1.0f, all, &volume_ray, &volume_segment); - /* branch RNG state */ - path_state_branch(&ps, j, num_samples); + /* indirect light sampling */ + int num_samples = kernel_data.integrator.volume_samples; + float num_samples_inv = 1.0f/num_samples; - VolumeIntegrateResult result = kernel_volume_decoupled_scatter(kg, - &ps, 
&volume_ray, &volume_sd, &tp, &tmp_rng, &volume_segment); - - if(result == VOLUME_PATH_SCATTERED) { - /* todo: use all-light sampling */ - if(kernel_path_integrate_scatter_lighting(kg, rng, &volume_sd, &tp, &ps, &L, &pray, num_samples_inv)) { + for(int j = 0; j < num_samples; j++) { + /* workaround to fix correlation bug in T38710, can find better solution + * in random number generator later, for now this is done here to not impact + * performance of rendering without volumes */ + RNG tmp_rng = cmj_hash(*rng, state.rng_offset); + + PathState ps = state; + Ray pray = ray; + float3 tp = throughput; + + /* branch RNG state */ + path_state_branch(&ps, j, num_samples); + + /* scatter sample. if we use distance sampling and take just one + * sample for direct and indirect light, we could share this + * computation, but makes code a bit complex */ + float rphase = path_state_rng_1D_for_decision(kg, &tmp_rng, &ps, PRNG_PHASE); + float rscatter = path_state_rng_1D_for_decision(kg, &tmp_rng, &ps, PRNG_SCATTER_DISTANCE); + + VolumeIntegrateResult result = kernel_volume_decoupled_scatter(kg, + &ps, &pray, &volume_sd, &tp, rphase, rscatter, &volume_segment, NULL, false); + + (void)result; + kernel_assert(result == VOLUME_PATH_SCATTERED); + + if(kernel_path_volume_bounce(kg, rng, &volume_sd, &tp, &ps, &L, &pray)) { kernel_path_indirect(kg, rng, pray, tp*num_samples_inv, num_samples, ps, &L); /* for render passes, sum and reset indirect light pass variables @@ -1150,18 +996,22 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in PathState ps = state; Ray pray = ray; ShaderData volume_sd; - float3 tp = throughput; + float3 tp = throughput * num_samples_inv; /* branch RNG state */ path_state_branch(&ps, j, num_samples); - VolumeIntegrateResult result = kernel_volume_integrate(kg, &ps, - &volume_sd, &volume_ray, &L, &tp, rng); + VolumeIntegrateResult result = kernel_volume_integrate( + kg, &ps, &volume_sd, &volume_ray, &L, &tp, rng, heterogeneous); 
+#ifdef __VOLUME_SCATTER__ if(result == VOLUME_PATH_SCATTERED) { - /* todo: use all-light sampling */ - if(kernel_path_integrate_scatter_lighting(kg, rng, &volume_sd, &tp, &ps, &L, &pray, num_samples_inv)) { - kernel_path_indirect(kg, rng, pray, tp*num_samples_inv, num_samples, ps, &L); + /* todo: support equiangular, MIS and all light sampling. + * alternatively get decoupled ray marching working on the GPU */ + kernel_path_volume_connect_light(kg, rng, &volume_sd, tp, &state, &L); + + if(kernel_path_volume_bounce(kg, rng, &volume_sd, &tp, &ps, &L, &pray)) { + kernel_path_indirect(kg, rng, pray, tp, num_samples, ps, &L); /* for render passes, sum and reset indirect light pass variables * for the next samples */ @@ -1169,6 +1019,7 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in path_radiance_reset_indirect(&L); } } +#endif } /* todo: avoid this calculation using decoupled ray marching */ @@ -1205,7 +1056,7 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in /* holdout */ #ifdef __HOLDOUT__ - if((sd.flag & (SD_HOLDOUT|SD_HOLDOUT_MASK))) { + if(sd.flag & (SD_HOLDOUT|SD_HOLDOUT_MASK)) { if(kernel_data.background.transparent) { float3 holdout_weight; @@ -1245,7 +1096,7 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in break; } else if(probability != 1.0f) { - float terminate = path_state_rng_1D(kg, rng, &state, PRNG_TERMINATE); + float terminate = path_state_rng_1D_for_decision(kg, rng, &state, PRNG_TERMINATE); if(terminate >= probability) break; @@ -1257,90 +1108,33 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in #ifdef __AO__ /* ambient occlusion */ if(kernel_data.integrator.use_ambient_occlusion || (sd.flag & SD_AO)) { - int num_samples = kernel_data.integrator.ao_samples; - float num_samples_inv = 1.0f/num_samples; - float ao_factor = kernel_data.background.ao_factor; - float3 ao_N; - float3 ao_bsdf = shader_bsdf_ao(kg, &sd, 
ao_factor, &ao_N); - float3 ao_alpha = shader_bsdf_alpha(kg, &sd); - - for(int j = 0; j < num_samples; j++) { - float bsdf_u, bsdf_v; - path_branched_rng_2D(kg, rng, &state, j, num_samples, PRNG_BSDF_U, &bsdf_u, &bsdf_v); - - float3 ao_D; - float ao_pdf; - - sample_cos_hemisphere(ao_N, bsdf_u, bsdf_v, &ao_D, &ao_pdf); - - if(dot(sd.Ng, ao_D) > 0.0f && ao_pdf != 0.0f) { - Ray light_ray; - float3 ao_shadow; - - light_ray.P = ray_offset(sd.P, sd.Ng); - light_ray.D = ao_D; - light_ray.t = kernel_data.background.ao_distance; -#ifdef __OBJECT_MOTION__ - light_ray.time = sd.time; -#endif - light_ray.dP = sd.dP; - light_ray.dD = differential3_zero(); - - if(!shadow_blocked(kg, &state, &light_ray, &ao_shadow)) - path_radiance_accum_ao(&L, throughput*num_samples_inv, ao_alpha, ao_bsdf, ao_shadow, state.bounce); - } - } + kernel_branched_path_ao(kg, &sd, &L, &state, rng, throughput); } #endif #ifdef __SUBSURFACE__ /* bssrdf scatter to a different location on the same object */ if(sd.flag & SD_BSSRDF) { - for(int i = 0; i< sd.num_closure; i++) { - ShaderClosure *sc = &sd.closure[i]; - - if(!CLOSURE_IS_BSSRDF(sc->type)) - continue; - - /* set up random number generator */ - uint lcg_state = lcg_state_init(rng, &state, 0x68bc21eb); - int num_samples = kernel_data.integrator.subsurface_samples; - float num_samples_inv = 1.0f/num_samples; - RNG bssrdf_rng = cmj_hash(*rng, i); - - state.flag |= PATH_RAY_BSSRDF_ANCESTOR; - - /* do subsurface scatter step with copy of shader data, this will - * replace the BSSRDF with a diffuse BSDF closure */ - for(int j = 0; j < num_samples; j++) { - ShaderData bssrdf_sd[BSSRDF_MAX_HITS]; - float bssrdf_u, bssrdf_v; - path_branched_rng_2D(kg, &bssrdf_rng, &state, j, num_samples, PRNG_BSDF_U, &bssrdf_u, &bssrdf_v); - int num_hits = subsurface_scatter_multi_step(kg, &sd, bssrdf_sd, state.flag, sc, &lcg_state, bssrdf_u, bssrdf_v, true); - - /* compute lighting with the BSDF closure */ - for(int hit = 0; hit < num_hits; hit++) { - PathState hit_state = 
state; - - path_state_branch(&hit_state, j, num_samples); - - kernel_branched_path_integrate_lighting(kg, rng, - &bssrdf_sd[hit], throughput, num_samples_inv, - &hit_state, &L, buffer); - } - } - - state.flag &= ~PATH_RAY_BSSRDF_ANCESTOR; - } + kernel_branched_path_subsurface_scatter(kg, &sd, &L, &state, + rng, &ray, throughput); } #endif if(!(sd.flag & SD_HAS_ONLY_VOLUME)) { PathState hit_state = state; - /* lighting */ - kernel_branched_path_integrate_lighting(kg, rng, - &sd, throughput, 1.0f, &hit_state, &L, buffer); +#ifdef __EMISSION__ + /* direct light */ + if(kernel_data.integrator.use_direct_light) { + bool all = kernel_data.integrator.sample_all_lights_direct; + kernel_branched_path_surface_connect_light(kg, rng, + &sd, &hit_state, throughput, 1.0f, &L, all); + } +#endif + + /* indirect light */ + kernel_branched_path_surface_indirect_light(kg, rng, + &sd, throughput, 1.0f, &hit_state, &L); /* continue in case of transparency */ throughput *= shader_bsdf_transparency(kg, &sd); @@ -1353,6 +1147,13 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in ray.P = ray_offset(sd.P, -sd.Ng); ray.t -= sd.ray_length; /* clipping works through transparent */ + +#ifdef __RAY_DIFFERENTIALS__ + ray.dP = sd.dP; + ray.dD.dx = -sd.dI.dx; + ray.dD.dy = -sd.dI.dy; +#endif + #ifdef __VOLUME__ /* enter/exit volume */ kernel_volume_stack_enter_exit(kg, &sd, state.volume_stack); @@ -1363,6 +1164,10 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in kernel_write_light_passes(kg, buffer, &L, sample); +#ifdef __KERNEL_DEBUG__ + kernel_write_debug_passes(kg, buffer, &state, &debug_data, sample); +#endif + return make_float4(L_sum.x, L_sum.y, L_sum.z, 1.0f - L_transparent); } @@ -1372,11 +1177,8 @@ ccl_device_inline void kernel_path_trace_setup(KernelGlobals *kg, ccl_global uin { float filter_u; float filter_v; -#ifdef __CMJ__ + int num_samples = kernel_data.integrator.aa_samples; -#else - int num_samples = 0; -#endif 
path_rng_init(kg, rng_state, sample, num_samples, rng, x, y, &filter_u, &filter_v); diff --git a/intern/cycles/kernel/kernel_path_state.h b/intern/cycles/kernel/kernel_path_state.h index 406654c1741..f29168642a4 100644 --- a/intern/cycles/kernel/kernel_path_state.h +++ b/intern/cycles/kernel/kernel_path_state.h @@ -16,17 +16,13 @@ CCL_NAMESPACE_BEGIN -ccl_device_inline void path_state_init(KernelGlobals *kg, PathState *state, RNG *rng, int sample) +ccl_device_inline void path_state_init(KernelGlobals *kg, PathState *state, RNG *rng, int sample, Ray *ray) { - state->flag = PATH_RAY_CAMERA|PATH_RAY_SINGULAR|PATH_RAY_MIS_SKIP; + state->flag = PATH_RAY_CAMERA|PATH_RAY_MIS_SKIP; state->rng_offset = PRNG_BASE_NUM; state->sample = sample; -#ifdef __CMJ__ state->num_samples = kernel_data.integrator.aa_samples; -#else - state->num_samples = 0; -#endif state->bounce = 0; state->diffuse_bounce = 0; @@ -45,7 +41,7 @@ ccl_device_inline void path_state_init(KernelGlobals *kg, PathState *state, RNG if(kernel_data.integrator.use_volumes) { /* initialize volume stack with volume we are inside of */ - kernel_volume_stack_init(kg, state->volume_stack); + kernel_volume_stack_init(kg, ray, state->volume_stack); /* seed RNG for cases where we can't use stratified samples */ state->rng_congruential = lcg_init(*rng + sample*0x51633e2d); } @@ -63,8 +59,8 @@ ccl_device_inline void path_state_next(KernelGlobals *kg, PathState *state, int state->flag |= PATH_RAY_TRANSPARENT; state->transparent_bounce++; - /* random number generator next bounce */ - state->rng_offset += PRNG_BOUNCE_NUM; + /* don't increase random number generator offset here, to avoid some + * unwanted patterns, see path_state_rng_1D_for_decision */ if(!kernel_data.integrator.transparent_shadows) state->flag |= PATH_RAY_MIS_SKIP; diff --git a/intern/cycles/kernel/kernel_path_surface.h b/intern/cycles/kernel/kernel_path_surface.h new file mode 100644 index 00000000000..9553c2da0df --- /dev/null +++ 
b/intern/cycles/kernel/kernel_path_surface.h @@ -0,0 +1,299 @@ +/* + * Copyright 2011-2013 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License + */ + +CCL_NAMESPACE_BEGIN + +#if defined(__BRANCHED_PATH__) || defined(__SUBSURFACE__) + +/* branched path tracing: connect path directly to position on one or more lights and add it to L */ +ccl_device void kernel_branched_path_surface_connect_light(KernelGlobals *kg, RNG *rng, + ShaderData *sd, PathState *state, float3 throughput, float num_samples_adjust, PathRadiance *L, bool sample_all_lights) +{ +#ifdef __EMISSION__ + /* sample illumination from lights to find path contribution */ + if(!(sd->flag & SD_BSDF_HAS_EVAL)) + return; + + Ray light_ray; + BsdfEval L_light; + bool is_lamp; + +#ifdef __OBJECT_MOTION__ + light_ray.time = sd->time; +#endif + + if(sample_all_lights) { + /* lamp sampling */ + for(int i = 0; i < kernel_data.integrator.num_all_lights; i++) { + int num_samples = ceil_to_int(num_samples_adjust*light_select_num_samples(kg, i)); + float num_samples_inv = num_samples_adjust/(num_samples*kernel_data.integrator.num_all_lights); + RNG lamp_rng = cmj_hash(*rng, i); + + if(kernel_data.integrator.pdf_triangles != 0.0f) + num_samples_inv *= 0.5f; + + for(int j = 0; j < num_samples; j++) { + float light_u, light_v; + path_branched_rng_2D(kg, &lamp_rng, state, j, num_samples, PRNG_LIGHT_U, &light_u, &light_v); + + LightSample ls; + lamp_light_sample(kg, i, light_u, light_v, sd->P, &ls); + + 
if(direct_emission(kg, sd, &ls, &light_ray, &L_light, &is_lamp, state->bounce, state->transparent_bounce)) { + /* trace shadow ray */ + float3 shadow; + + if(!shadow_blocked(kg, state, &light_ray, &shadow)) { + /* accumulate */ + path_radiance_accum_light(L, throughput*num_samples_inv, &L_light, shadow, num_samples_inv, state->bounce, is_lamp); + } + } + } + } + + /* mesh light sampling */ + if(kernel_data.integrator.pdf_triangles != 0.0f) { + int num_samples = ceil_to_int(num_samples_adjust*kernel_data.integrator.mesh_light_samples); + float num_samples_inv = num_samples_adjust/num_samples; + + if(kernel_data.integrator.num_all_lights) + num_samples_inv *= 0.5f; + + for(int j = 0; j < num_samples; j++) { + float light_t = path_branched_rng_1D(kg, rng, state, j, num_samples, PRNG_LIGHT); + float light_u, light_v; + path_branched_rng_2D(kg, rng, state, j, num_samples, PRNG_LIGHT_U, &light_u, &light_v); + + /* only sample triangle lights */ + if(kernel_data.integrator.num_all_lights) + light_t = 0.5f*light_t; + + LightSample ls; + light_sample(kg, light_t, light_u, light_v, sd->time, sd->P, &ls); + + if(direct_emission(kg, sd, &ls, &light_ray, &L_light, &is_lamp, state->bounce, state->transparent_bounce)) { + /* trace shadow ray */ + float3 shadow; + + if(!shadow_blocked(kg, state, &light_ray, &shadow)) { + /* accumulate */ + path_radiance_accum_light(L, throughput*num_samples_inv, &L_light, shadow, num_samples_inv, state->bounce, is_lamp); + } + } + } + } + } + else { + /* sample one light at random */ + float light_t = path_state_rng_1D(kg, rng, state, PRNG_LIGHT); + float light_u, light_v; + path_state_rng_2D(kg, rng, state, PRNG_LIGHT_U, &light_u, &light_v); + + LightSample ls; + light_sample(kg, light_t, light_u, light_v, sd->time, sd->P, &ls); + + /* sample random light */ + if(direct_emission(kg, sd, &ls, &light_ray, &L_light, &is_lamp, state->bounce, state->transparent_bounce)) { + /* trace shadow ray */ + float3 shadow; + + if(!shadow_blocked(kg, state, 
&light_ray, &shadow)) { + /* accumulate */ + path_radiance_accum_light(L, throughput*num_samples_adjust, &L_light, shadow, num_samples_adjust, state->bounce, is_lamp); + } + } + } +#endif +} + +/* branched path tracing: bounce off or through surface to with new direction stored in ray */ +ccl_device bool kernel_branched_path_surface_bounce(KernelGlobals *kg, RNG *rng, + ShaderData *sd, const ShaderClosure *sc, int sample, int num_samples, + float3 *throughput, PathState *state, PathRadiance *L, Ray *ray) +{ + /* sample BSDF */ + float bsdf_pdf; + BsdfEval bsdf_eval; + float3 bsdf_omega_in; + differential3 bsdf_domega_in; + float bsdf_u, bsdf_v; + path_branched_rng_2D(kg, rng, state, sample, num_samples, PRNG_BSDF_U, &bsdf_u, &bsdf_v); + int label; + + label = shader_bsdf_sample_closure(kg, sd, sc, bsdf_u, bsdf_v, &bsdf_eval, + &bsdf_omega_in, &bsdf_domega_in, &bsdf_pdf); + + if(bsdf_pdf == 0.0f || bsdf_eval_is_zero(&bsdf_eval)) + return false; + + /* modify throughput */ + path_radiance_bsdf_bounce(L, throughput, &bsdf_eval, bsdf_pdf, state->bounce, label); + + /* modify path state */ + path_state_next(kg, state, label); + + /* setup ray */ + ray->P = ray_offset(sd->P, (label & LABEL_TRANSMIT)? 
-sd->Ng: sd->Ng); + ray->D = bsdf_omega_in; + ray->t = FLT_MAX; +#ifdef __RAY_DIFFERENTIALS__ + ray->dP = sd->dP; + ray->dD = bsdf_domega_in; +#endif +#ifdef __OBJECT_MOTION__ + ray->time = sd->time; +#endif + +#ifdef __VOLUME__ + /* enter/exit volume */ + if(label & LABEL_TRANSMIT) + kernel_volume_stack_enter_exit(kg, sd, state->volume_stack); +#endif + + /* branch RNG state */ + path_state_branch(state, sample, num_samples); + + /* set MIS state */ + state->min_ray_pdf = fminf(bsdf_pdf, FLT_MAX); + state->ray_pdf = bsdf_pdf; +#ifdef __LAMP_MIS__ + state->ray_t = 0.0f; +#endif + + return true; +} + +#endif + +/* path tracing: connect path directly to position on a light and add it to L */ +ccl_device_inline void kernel_path_surface_connect_light(KernelGlobals *kg, RNG *rng, + ShaderData *sd, float3 throughput, PathState *state, PathRadiance *L) +{ +#ifdef __EMISSION__ + if(!(kernel_data.integrator.use_direct_light && (sd->flag & SD_BSDF_HAS_EVAL))) + return; + + /* sample illumination from lights to find path contribution */ + float light_t = path_state_rng_1D(kg, rng, state, PRNG_LIGHT); + float light_u, light_v; + path_state_rng_2D(kg, rng, state, PRNG_LIGHT_U, &light_u, &light_v); + + Ray light_ray; + BsdfEval L_light; + bool is_lamp; + +#ifdef __OBJECT_MOTION__ + light_ray.time = sd->time; +#endif + + LightSample ls; + light_sample(kg, light_t, light_u, light_v, sd->time, sd->P, &ls); + + if(direct_emission(kg, sd, &ls, &light_ray, &L_light, &is_lamp, state->bounce, state->transparent_bounce)) { + /* trace shadow ray */ + float3 shadow; + + if(!shadow_blocked(kg, state, &light_ray, &shadow)) { + /* accumulate */ + path_radiance_accum_light(L, throughput, &L_light, shadow, 1.0f, state->bounce, is_lamp); + } + } +#endif +} + +/* path tracing: bounce off or through surface to with new direction stored in ray */ +ccl_device_inline bool kernel_path_surface_bounce(KernelGlobals *kg, RNG *rng, + ShaderData *sd, float3 *throughput, PathState *state, PathRadiance *L, 
Ray *ray) +{ + /* no BSDF? we can stop here */ + if(sd->flag & SD_BSDF) { + /* sample BSDF */ + float bsdf_pdf; + BsdfEval bsdf_eval; + float3 bsdf_omega_in; + differential3 bsdf_domega_in; + float bsdf_u, bsdf_v; + path_state_rng_2D(kg, rng, state, PRNG_BSDF_U, &bsdf_u, &bsdf_v); + int label; + + label = shader_bsdf_sample(kg, sd, bsdf_u, bsdf_v, &bsdf_eval, + &bsdf_omega_in, &bsdf_domega_in, &bsdf_pdf); + + if(bsdf_pdf == 0.0f || bsdf_eval_is_zero(&bsdf_eval)) + return false; + + /* modify throughput */ + path_radiance_bsdf_bounce(L, throughput, &bsdf_eval, bsdf_pdf, state->bounce, label); + + /* set labels */ + if(!(label & LABEL_TRANSPARENT)) { + state->ray_pdf = bsdf_pdf; +#ifdef __LAMP_MIS__ + state->ray_t = 0.0f; +#endif + state->min_ray_pdf = fminf(bsdf_pdf, state->min_ray_pdf); + } + + /* update path state */ + path_state_next(kg, state, label); + + /* setup ray */ + ray->P = ray_offset(sd->P, (label & LABEL_TRANSMIT)? -sd->Ng: sd->Ng); + ray->D = bsdf_omega_in; + + if(state->bounce == 0) + ray->t -= sd->ray_length; /* clipping works through transparent */ + else + ray->t = FLT_MAX; + +#ifdef __RAY_DIFFERENTIALS__ + ray->dP = sd->dP; + ray->dD = bsdf_domega_in; +#endif + +#ifdef __VOLUME__ + /* enter/exit volume */ + if(label & LABEL_TRANSMIT) + kernel_volume_stack_enter_exit(kg, sd, state->volume_stack); +#endif + return true; + } +#ifdef __VOLUME__ + else if(sd->flag & SD_HAS_ONLY_VOLUME) { + /* no surface shader but have a volume shader? act transparent */ + + /* update path state, count as transparent */ + path_state_next(kg, state, LABEL_TRANSPARENT); + + /* setup ray position, direction stays unchanged */ + ray->P = ray_offset(sd->P, -sd->Ng); +#ifdef __RAY_DIFFERENTIALS__ + ray->dP = sd->dP; +#endif + + /* enter/exit volume */ + kernel_volume_stack_enter_exit(kg, sd, state->volume_stack); + return true; + } +#endif + else { + /* no bsdf or volume? 
*/ + return false; + } +} + +CCL_NAMESPACE_END + diff --git a/intern/cycles/kernel/kernel_path_volume.h b/intern/cycles/kernel/kernel_path_volume.h new file mode 100644 index 00000000000..d8143832294 --- /dev/null +++ b/intern/cycles/kernel/kernel_path_volume.h @@ -0,0 +1,267 @@ +/* + * Copyright 2011-2013 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License + */ + +CCL_NAMESPACE_BEGIN + +#ifdef __VOLUME_SCATTER__ + +ccl_device void kernel_path_volume_connect_light(KernelGlobals *kg, RNG *rng, + ShaderData *sd, float3 throughput, PathState *state, PathRadiance *L) +{ +#ifdef __EMISSION__ + if(!kernel_data.integrator.use_direct_light) + return; + + /* sample illumination from lights to find path contribution */ + float light_t = path_state_rng_1D(kg, rng, state, PRNG_LIGHT); + float light_u, light_v; + path_state_rng_2D(kg, rng, state, PRNG_LIGHT_U, &light_u, &light_v); + + Ray light_ray; + BsdfEval L_light; + LightSample ls; + bool is_lamp; + + /* connect to light from given point where shader has been evaluated */ +#ifdef __OBJECT_MOTION__ + light_ray.time = sd->time; +#endif + + light_sample(kg, light_t, light_u, light_v, sd->time, sd->P, &ls); + if(ls.pdf == 0.0f) + return; + + if(direct_emission(kg, sd, &ls, &light_ray, &L_light, &is_lamp, state->bounce, state->transparent_bounce)) { + /* trace shadow ray */ + float3 shadow; + + if(!shadow_blocked(kg, state, &light_ray, &shadow)) { + /* accumulate */ + path_radiance_accum_light(L, 
throughput, &L_light, shadow, 1.0f, state->bounce, is_lamp); + } + } +#endif +} + +#ifdef __KERNEL_GPU__ +ccl_device_noinline +#else +ccl_device +#endif +bool kernel_path_volume_bounce(KernelGlobals *kg, RNG *rng, + ShaderData *sd, float3 *throughput, PathState *state, PathRadiance *L, Ray *ray) +{ + /* sample phase function */ + float phase_pdf; + BsdfEval phase_eval; + float3 phase_omega_in; + differential3 phase_domega_in; + float phase_u, phase_v; + path_state_rng_2D(kg, rng, state, PRNG_PHASE_U, &phase_u, &phase_v); + int label; + + label = shader_volume_phase_sample(kg, sd, phase_u, phase_v, &phase_eval, + &phase_omega_in, &phase_domega_in, &phase_pdf); + + if(phase_pdf == 0.0f || bsdf_eval_is_zero(&phase_eval)) + return false; + + /* modify throughput */ + path_radiance_bsdf_bounce(L, throughput, &phase_eval, phase_pdf, state->bounce, label); + + /* set labels */ + state->ray_pdf = phase_pdf; +#ifdef __LAMP_MIS__ + state->ray_t = 0.0f; +#endif + state->min_ray_pdf = fminf(phase_pdf, state->min_ray_pdf); + + /* update path state */ + path_state_next(kg, state, label); + + /* setup ray */ + ray->P = sd->P; + ray->D = phase_omega_in; + ray->t = FLT_MAX; + +#ifdef __RAY_DIFFERENTIALS__ + ray->dP = sd->dP; + ray->dD = phase_domega_in; +#endif + + return true; +} + +ccl_device void kernel_branched_path_volume_connect_light(KernelGlobals *kg, RNG *rng, + ShaderData *sd, float3 throughput, PathState *state, PathRadiance *L, + float num_samples_adjust, bool sample_all_lights, Ray *ray, const VolumeSegment *segment) +{ +#ifdef __EMISSION__ + if(!kernel_data.integrator.use_direct_light) + return; + + Ray light_ray; + BsdfEval L_light; + bool is_lamp; + +#ifdef __OBJECT_MOTION__ + light_ray.time = sd->time; +#endif + + if(sample_all_lights) { + /* lamp sampling */ + for(int i = 0; i < kernel_data.integrator.num_all_lights; i++) { + int num_samples = ceil_to_int(num_samples_adjust*light_select_num_samples(kg, i)); + float num_samples_inv = 
num_samples_adjust/(num_samples*kernel_data.integrator.num_all_lights); + RNG lamp_rng = cmj_hash(*rng, i); + + if(kernel_data.integrator.pdf_triangles != 0.0f) + num_samples_inv *= 0.5f; + + for(int j = 0; j < num_samples; j++) { + /* sample random position on given light */ + float light_u, light_v; + path_branched_rng_2D(kg, &lamp_rng, state, j, num_samples, PRNG_LIGHT_U, &light_u, &light_v); + + LightSample ls; + lamp_light_sample(kg, i, light_u, light_v, ray->P, &ls); + + float3 tp = throughput; + + /* sample position on volume segment */ + float rphase = path_branched_rng_1D_for_decision(kg, rng, state, j, num_samples, PRNG_PHASE); + float rscatter = path_branched_rng_1D_for_decision(kg, rng, state, j, num_samples, PRNG_SCATTER_DISTANCE); + + VolumeIntegrateResult result = kernel_volume_decoupled_scatter(kg, + state, ray, sd, &tp, rphase, rscatter, segment, (ls.t != FLT_MAX)? &ls.P: NULL, false); + + (void)result; + kernel_assert(result == VOLUME_PATH_SCATTERED); + + /* todo: split up light_sample so we don't have to call it again with new position */ + lamp_light_sample(kg, i, light_u, light_v, sd->P, &ls); + + if(ls.pdf == 0.0f) + continue; + + if(direct_emission(kg, sd, &ls, &light_ray, &L_light, &is_lamp, state->bounce, state->transparent_bounce)) { + /* trace shadow ray */ + float3 shadow; + + if(!shadow_blocked(kg, state, &light_ray, &shadow)) { + /* accumulate */ + path_radiance_accum_light(L, tp*num_samples_inv, &L_light, shadow, num_samples_inv, state->bounce, is_lamp); + } + } + } + } + + /* mesh light sampling */ + if(kernel_data.integrator.pdf_triangles != 0.0f) { + int num_samples = ceil_to_int(num_samples_adjust*kernel_data.integrator.mesh_light_samples); + float num_samples_inv = num_samples_adjust/num_samples; + + if(kernel_data.integrator.num_all_lights) + num_samples_inv *= 0.5f; + + for(int j = 0; j < num_samples; j++) { + /* sample random position on random triangle */ + float light_t = path_branched_rng_1D_for_decision(kg, rng, state, j, 
num_samples, PRNG_LIGHT); + float light_u, light_v; + path_branched_rng_2D(kg, rng, state, j, num_samples, PRNG_LIGHT_U, &light_u, &light_v); + + /* only sample triangle lights */ + if(kernel_data.integrator.num_all_lights) + light_t = 0.5f*light_t; + + LightSample ls; + light_sample(kg, light_t, light_u, light_v, sd->time, ray->P, &ls); + + float3 tp = throughput; + + /* sample position on volume segment */ + float rphase = path_branched_rng_1D_for_decision(kg, rng, state, j, num_samples, PRNG_PHASE); + float rscatter = path_branched_rng_1D_for_decision(kg, rng, state, j, num_samples, PRNG_SCATTER_DISTANCE); + + VolumeIntegrateResult result = kernel_volume_decoupled_scatter(kg, + state, ray, sd, &tp, rphase, rscatter, segment, (ls.t != FLT_MAX)? &ls.P: NULL, false); + + (void)result; + kernel_assert(result == VOLUME_PATH_SCATTERED); + + /* todo: split up light_sample so we don't have to call it again with new position */ + light_sample(kg, light_t, light_u, light_v, sd->time, sd->P, &ls); + + if(ls.pdf == 0.0f) + continue; + + if(direct_emission(kg, sd, &ls, &light_ray, &L_light, &is_lamp, state->bounce, state->transparent_bounce)) { + /* trace shadow ray */ + float3 shadow; + + if(!shadow_blocked(kg, state, &light_ray, &shadow)) { + /* accumulate */ + path_radiance_accum_light(L, tp*num_samples_inv, &L_light, shadow, num_samples_inv, state->bounce, is_lamp); + } + } + } + } + } + else { + /* sample random position on random light */ + float light_t = path_state_rng_1D(kg, rng, state, PRNG_LIGHT); + float light_u, light_v; + path_state_rng_2D(kg, rng, state, PRNG_LIGHT_U, &light_u, &light_v); + + LightSample ls; + light_sample(kg, light_t, light_u, light_v, sd->time, ray->P, &ls); + + float3 tp = throughput; + + /* sample position on volume segment */ + float rphase = path_state_rng_1D_for_decision(kg, rng, state, PRNG_PHASE); + float rscatter = path_state_rng_1D_for_decision(kg, rng, state, PRNG_SCATTER_DISTANCE); + + VolumeIntegrateResult result = 
kernel_volume_decoupled_scatter(kg, + state, ray, sd, &tp, rphase, rscatter, segment, (ls.t != FLT_MAX)? &ls.P: NULL, false); + + (void)result; + kernel_assert(result == VOLUME_PATH_SCATTERED); + + /* todo: split up light_sample so we don't have to call it again with new position */ + light_sample(kg, light_t, light_u, light_v, sd->time, sd->P, &ls); + + if(ls.pdf == 0.0f) + return; + + /* sample random light */ + if(direct_emission(kg, sd, &ls, &light_ray, &L_light, &is_lamp, state->bounce, state->transparent_bounce)) { + /* trace shadow ray */ + float3 shadow; + + if(!shadow_blocked(kg, state, &light_ray, &shadow)) { + /* accumulate */ + path_radiance_accum_light(L, tp, &L_light, shadow, 1.0f, state->bounce, is_lamp); + } + } + } +#endif +} + +#endif + +CCL_NAMESPACE_END + diff --git a/intern/cycles/kernel/kernel_random.h b/intern/cycles/kernel/kernel_random.h index 31cb6ff6abd..236f74c0a82 100644 --- a/intern/cycles/kernel/kernel_random.h +++ b/intern/cycles/kernel/kernel_random.h @@ -261,22 +261,41 @@ ccl_device uint lcg_init(uint seed) * For branches in the path we must be careful not to reuse the same number * in a sequence and offset accordingly. */ -ccl_device_inline float path_state_rng_1D(KernelGlobals *kg, RNG *rng, PathState *state, int dimension) +ccl_device_inline float path_state_rng_1D(KernelGlobals *kg, RNG *rng, const PathState *state, int dimension) { return path_rng_1D(kg, rng, state->sample, state->num_samples, state->rng_offset + dimension); } -ccl_device_inline void path_state_rng_2D(KernelGlobals *kg, RNG *rng, PathState *state, int dimension, float *fx, float *fy) +ccl_device_inline float path_state_rng_1D_for_decision(KernelGlobals *kg, RNG *rng, const PathState *state, int dimension) +{ + /* the rng_offset is not increased for transparent bounces. if we do then + * fully transparent objects can become subtly visible by the different + * sampling patterns used where the transparent object is. 
+ * + * however for some random numbers that will determine if we next bounce + * is transparent we do need to increase the offset to avoid always making + * the same decision */ + int rng_offset = state->rng_offset + state->transparent_bounce*PRNG_BOUNCE_NUM; + return path_rng_1D(kg, rng, state->sample, state->num_samples, rng_offset + dimension); +} + +ccl_device_inline void path_state_rng_2D(KernelGlobals *kg, RNG *rng, const PathState *state, int dimension, float *fx, float *fy) { path_rng_2D(kg, rng, state->sample, state->num_samples, state->rng_offset + dimension, fx, fy); } -ccl_device_inline float path_branched_rng_1D(KernelGlobals *kg, RNG *rng, PathState *state, int branch, int num_branches, int dimension) +ccl_device_inline float path_branched_rng_1D(KernelGlobals *kg, RNG *rng, const PathState *state, int branch, int num_branches, int dimension) { return path_rng_1D(kg, rng, state->sample*num_branches + branch, state->num_samples*num_branches, state->rng_offset + dimension); } -ccl_device_inline void path_branched_rng_2D(KernelGlobals *kg, RNG *rng, PathState *state, int branch, int num_branches, int dimension, float *fx, float *fy) +ccl_device_inline float path_branched_rng_1D_for_decision(KernelGlobals *kg, RNG *rng, const PathState *state, int branch, int num_branches, int dimension) +{ + int rng_offset = state->rng_offset + state->transparent_bounce*PRNG_BOUNCE_NUM; + return path_rng_1D(kg, rng, state->sample*num_branches + branch, state->num_samples*num_branches, rng_offset + dimension); +} + +ccl_device_inline void path_branched_rng_2D(KernelGlobals *kg, RNG *rng, const PathState *state, int branch, int num_branches, int dimension, float *fx, float *fy) { path_rng_2D(kg, rng, state->sample*num_branches + branch, state->num_samples*num_branches, state->rng_offset + dimension, fx, fy); } @@ -290,7 +309,7 @@ ccl_device_inline void path_state_branch(PathState *state, int branch, int num_b state->num_samples = state->num_samples*num_branches; } 
-ccl_device_inline uint lcg_state_init(RNG *rng, PathState *state, uint scramble) +ccl_device_inline uint lcg_state_init(RNG *rng, const PathState *state, uint scramble) { return lcg_init(*rng + state->rng_offset + state->sample*scramble); } diff --git a/intern/cycles/kernel/kernel_shader.h b/intern/cycles/kernel/kernel_shader.h index 58cec090410..db08c328d7e 100644 --- a/intern/cycles/kernel/kernel_shader.h +++ b/intern/cycles/kernel/kernel_shader.h @@ -86,9 +86,8 @@ ccl_device void shader_setup_from_ray(KernelGlobals *kg, ShaderData *sd, #endif if(sd->type & PRIMITIVE_TRIANGLE) { /* static triangle */ - float4 Ns = kernel_tex_fetch(__tri_normal, sd->prim); - float3 Ng = make_float3(Ns.x, Ns.y, Ns.z); - sd->shader = __float_as_int(Ns.w); + float3 Ng = triangle_normal(kg, sd); + sd->shader = kernel_tex_fetch(__tri_shader, sd->prim); /* vectors */ sd->P = triangle_refine(kg, sd, isect, ray); @@ -166,9 +165,8 @@ ccl_device_inline void shader_setup_from_subsurface(KernelGlobals *kg, ShaderDat /* fetch triangle data */ if(sd->type == PRIMITIVE_TRIANGLE) { - float4 Ns = kernel_tex_fetch(__tri_normal, sd->prim); - float3 Ng = make_float3(Ns.x, Ns.y, Ns.z); - sd->shader = __float_as_int(Ns.w); + float3 Ng = triangle_normal(kg, sd); + sd->shader = kernel_tex_fetch(__tri_shader, sd->prim); /* static triangle */ sd->P = triangle_refine_subsurface(kg, sd, isect, ray); @@ -342,7 +340,7 @@ ccl_device void shader_setup_from_displace(KernelGlobals *kg, ShaderData *sd, float3 P, Ng, I = make_float3(0.0f, 0.0f, 0.0f); int shader; - triangle_point_normal(kg, prim, u, v, &P, &Ng, &shader); + triangle_point_normal(kg, object, prim, u, v, &P, &Ng, &shader); /* force smooth shading for displacement */ shader |= SHADER_SMOOTH_NORMAL; @@ -609,6 +607,9 @@ ccl_device void shader_bsdf_blur(KernelGlobals *kg, ShaderData *sd, float roughn ccl_device float3 shader_bsdf_transparency(KernelGlobals *kg, ShaderData *sd) { + if(sd->flag & SD_HAS_ONLY_VOLUME) + return make_float3(1.0f, 1.0f, 1.0f); + 
float3 eval = make_float3(0.0f, 0.0f, 0.0f); for(int i = 0; i< sd->num_closure; i++) { @@ -797,8 +798,8 @@ ccl_device void shader_eval_surface(KernelGlobals *kg, ShaderData *sd, #ifdef __SVM__ svm_eval_nodes(kg, sd, SHADER_TYPE_SURFACE, path_flag); #else - sd->closure.weight = make_float3(0.8f, 0.8f, 0.8f); - sd->closure.N = sd->N; + sd->closure->weight = make_float3(0.8f, 0.8f, 0.8f); + sd->closure->N = sd->N; sd->flag |= bsdf_diffuse_setup(&sd->closure); #endif } @@ -857,7 +858,7 @@ ccl_device_inline void _shader_volume_phase_multi_eval(const ShaderData *sd, con if(phase_pdf != 0.0f) { bsdf_eval_accum(result_eval, sc->type, eval); - sum_pdf += phase_pdf; + sum_pdf += phase_pdf*sc->sample_weight; } sum_sample_weight += sc->sample_weight; @@ -1025,8 +1026,7 @@ ccl_device bool shader_transparent_shadow(KernelGlobals *kg, Intersection *isect #ifdef __HAIR__ if(kernel_tex_fetch(__prim_type, isect->prim) & PRIMITIVE_ALL_TRIANGLE) { #endif - float4 Ns = kernel_tex_fetch(__tri_normal, prim); - shader = __float_as_int(Ns.w); + shader = kernel_tex_fetch(__tri_shader, prim); #ifdef __HAIR__ } else { diff --git a/intern/cycles/kernel/kernel_shadow.h b/intern/cycles/kernel/kernel_shadow.h index ab7524c411a..61954282c28 100644 --- a/intern/cycles/kernel/kernel_shadow.h +++ b/intern/cycles/kernel/kernel_shadow.h @@ -64,18 +64,21 @@ ccl_device_inline bool shadow_blocked(KernelGlobals *kg, PathState *state, Ray * bool blocked; if(kernel_data.integrator.transparent_shadows) { + /* check transparent bounces here, for volume scatter which can do + * lighting before surface path termination is checked */ + if(state->transparent_bounce >= kernel_data.integrator.transparent_max_bounce) + return true; + /* intersect to find an opaque surface, or record all transparent surface hits */ Intersection hits_stack[STACK_MAX_HITS]; - Intersection *hits; + Intersection *hits = hits_stack; uint max_hits = kernel_data.integrator.transparent_max_bounce - state->transparent_bounce - 1; /* prefer to 
use stack but use dynamic allocation if too deep max hits * we need max_hits + 1 storage space due to the logic in * scene_intersect_shadow_all which will first store and then check if * the limit is exceeded */ - if(max_hits + 1 <= STACK_MAX_HITS) - hits = hits_stack; - else + if(max_hits + 1 > STACK_MAX_HITS) hits = (Intersection*)malloc(sizeof(Intersection)*(max_hits + 1)); uint num_hits; @@ -152,7 +155,11 @@ ccl_device_inline bool shadow_blocked(KernelGlobals *kg, PathState *state, Ray * kernel_volume_shadow(kg, &ps, ray, &throughput); #endif - *shadow *= throughput; + *shadow = throughput; + + if(hits != hits_stack) + free(hits); + return is_zero(throughput); } /* free dynamic storage */ @@ -161,11 +168,7 @@ ccl_device_inline bool shadow_blocked(KernelGlobals *kg, PathState *state, Ray * } else { Intersection isect; -#ifdef __HAIR__ blocked = scene_intersect(kg, ray, PATH_RAY_SHADOW_OPAQUE, &isect, NULL, 0.0f, 0.0f); -#else - blocked = scene_intersect(kg, ray, PATH_RAY_SHADOW_OPAQUE, &isect); -#endif } #ifdef __VOLUME__ @@ -178,6 +181,8 @@ ccl_device_inline bool shadow_blocked(KernelGlobals *kg, PathState *state, Ray * return blocked; } +#undef STACK_MAX_HITS + #else /* Shadow function to compute how much light is blocked, GPU variation. 
@@ -196,11 +201,7 @@ ccl_device_inline bool shadow_blocked(KernelGlobals *kg, PathState *state, Ray * return false; Intersection isect; -#ifdef __HAIR__ bool blocked = scene_intersect(kg, ray, PATH_RAY_SHADOW_OPAQUE, &isect, NULL, 0.0f, 0.0f); -#else - bool blocked = scene_intersect(kg, ray, PATH_RAY_SHADOW_OPAQUE, &isect); -#endif #ifdef __TRANSPARENT_SHADOWS__ if(blocked && kernel_data.integrator.transparent_shadows) { @@ -216,11 +217,7 @@ ccl_device_inline bool shadow_blocked(KernelGlobals *kg, PathState *state, Ray * if(bounce >= kernel_data.integrator.transparent_max_bounce) return true; -#ifdef __HAIR__ if(!scene_intersect(kg, ray, PATH_RAY_SHADOW_TRANSPARENT, &isect, NULL, 0.0f, 0.0f)) -#else - if(!scene_intersect(kg, ray, PATH_RAY_SHADOW_TRANSPARENT, &isect)) -#endif { #ifdef __VOLUME__ diff --git a/intern/cycles/kernel/kernel_sse2.cpp b/intern/cycles/kernel/kernel_sse2.cpp index 2d5f6091908..740998e8c92 100644 --- a/intern/cycles/kernel/kernel_sse2.cpp +++ b/intern/cycles/kernel/kernel_sse2.cpp @@ -34,7 +34,7 @@ #include "kernel_globals.h" #include "kernel_film.h" #include "kernel_path.h" -#include "kernel_displace.h" +#include "kernel_bake.h" CCL_NAMESPACE_BEGIN @@ -64,9 +64,12 @@ void kernel_cpu_sse2_convert_to_half_float(KernelGlobals *kg, uchar4 *rgba, floa /* Shader Evaluate */ -void kernel_cpu_sse2_shader(KernelGlobals *kg, uint4 *input, float4 *output, int type, int i) +void kernel_cpu_sse2_shader(KernelGlobals *kg, uint4 *input, float4 *output, int type, int i, int offset, int sample) { - kernel_shader_evaluate(kg, input, output, (ShaderEvalType)type, i); + if(type >= SHADER_EVAL_BAKE) + kernel_bake_evaluate(kg, input, output, (ShaderEvalType)type, i, offset, sample); + else + kernel_shader_evaluate(kg, input, output, (ShaderEvalType)type, i, sample); } CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernel_sse3.cpp b/intern/cycles/kernel/kernel_sse3.cpp index 1062fd0c990..da73a3a1c97 100644 --- a/intern/cycles/kernel/kernel_sse3.cpp +++ 
b/intern/cycles/kernel/kernel_sse3.cpp @@ -36,7 +36,7 @@ #include "kernel_globals.h" #include "kernel_film.h" #include "kernel_path.h" -#include "kernel_displace.h" +#include "kernel_bake.h" CCL_NAMESPACE_BEGIN @@ -66,9 +66,12 @@ void kernel_cpu_sse3_convert_to_half_float(KernelGlobals *kg, uchar4 *rgba, floa /* Shader Evaluate */ -void kernel_cpu_sse3_shader(KernelGlobals *kg, uint4 *input, float4 *output, int type, int i) +void kernel_cpu_sse3_shader(KernelGlobals *kg, uint4 *input, float4 *output, int type, int i, int offset, int sample) { - kernel_shader_evaluate(kg, input, output, (ShaderEvalType)type, i); + if(type >= SHADER_EVAL_BAKE) + kernel_bake_evaluate(kg, input, output, (ShaderEvalType)type, i, offset, sample); + else + kernel_shader_evaluate(kg, input, output, (ShaderEvalType)type, i, sample); } CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernel_sse41.cpp b/intern/cycles/kernel/kernel_sse41.cpp index ba3b4887650..5704f60e138 100644 --- a/intern/cycles/kernel/kernel_sse41.cpp +++ b/intern/cycles/kernel/kernel_sse41.cpp @@ -37,7 +37,7 @@ #include "kernel_globals.h" #include "kernel_film.h" #include "kernel_path.h" -#include "kernel_displace.h" +#include "kernel_bake.h" CCL_NAMESPACE_BEGIN @@ -67,9 +67,12 @@ void kernel_cpu_sse41_convert_to_half_float(KernelGlobals *kg, uchar4 *rgba, flo /* Shader Evaluate */ -void kernel_cpu_sse41_shader(KernelGlobals *kg, uint4 *input, float4 *output, int type, int i) +void kernel_cpu_sse41_shader(KernelGlobals *kg, uint4 *input, float4 *output, int type, int i, int offset, int sample) { - kernel_shader_evaluate(kg, input, output, (ShaderEvalType)type, i); + if(type >= SHADER_EVAL_BAKE) + kernel_bake_evaluate(kg, input, output, (ShaderEvalType)type, i, offset, sample); + else + kernel_shader_evaluate(kg, input, output, (ShaderEvalType)type, i, sample); } CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernel_textures.h b/intern/cycles/kernel/kernel_textures.h index b07075c6c95..ef46b2f707f 100644 --- 
a/intern/cycles/kernel/kernel_textures.h +++ b/intern/cycles/kernel/kernel_textures.h @@ -36,7 +36,7 @@ KERNEL_TEX(float4, texture_float4, __objects) KERNEL_TEX(float4, texture_float4, __objects_vector) /* triangles */ -KERNEL_TEX(float4, texture_float4, __tri_normal) +KERNEL_TEX(uint, texture_uint, __tri_shader) KERNEL_TEX(float4, texture_float4, __tri_vnormal) KERNEL_TEX(float4, texture_float4, __tri_vindex) KERNEL_TEX(float4, texture_float4, __tri_verts) @@ -49,6 +49,7 @@ KERNEL_TEX(float4, texture_float4, __curve_keys) KERNEL_TEX(uint4, texture_uint4, __attributes_map) KERNEL_TEX(float, texture_float, __attributes_float) KERNEL_TEX(float4, texture_float4, __attributes_float3) +KERNEL_TEX(uchar4, texture_uchar4, __attributes_uchar4) /* lights */ KERNEL_TEX(float4, texture_float4, __light_distribution) @@ -172,10 +173,9 @@ KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_095) KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_096) KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_097) KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_098) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_099) /* Kepler and above */ -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 300) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_099) KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_100) KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_101) KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_102) @@ -227,7 +227,6 @@ KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_147) KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_148) KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_149) KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_150) -#endif /* packed image (opencl) */ KERNEL_TEX(uchar4, texture_uchar4, __tex_image_packed) diff --git a/intern/cycles/kernel/kernel_types.h b/intern/cycles/kernel/kernel_types.h index 11445aa1c93..cfac8d1e905 100644 --- 
a/intern/cycles/kernel/kernel_types.h +++ b/intern/cycles/kernel/kernel_types.h @@ -38,12 +38,14 @@ CCL_NAMESPACE_BEGIN #define BSSRDF_MIN_RADIUS 1e-8f #define BSSRDF_MAX_HITS 4 -#define BB_DRAPPER 800.0f +#define BB_DRAPER 800.0f #define BB_MAX_TABLE_RANGE 12000.0f #define BB_TABLE_XPOWER 1.5f #define BB_TABLE_YPOWER 5.0f #define BB_TABLE_SPACING 2.0f +#define BECKMANN_TABLE_SIZE 256 + #define TEX_NUM_FLOAT_IMAGES 5 #define SHADER_NONE (~0) @@ -64,6 +66,8 @@ CCL_NAMESPACE_BEGIN #define __SUBSURFACE__ #define __CMJ__ #define __VOLUME__ +#define __VOLUME_DECOUPLED__ +#define __VOLUME_SCATTER__ #define __SHADOW_RECORD_ALL__ #endif @@ -71,10 +75,15 @@ CCL_NAMESPACE_BEGIN #define __KERNEL_SHADING__ #define __KERNEL_ADV_SHADING__ #define __BRANCHED_PATH__ +#define __VOLUME__ +#define __VOLUME_SCATTER__ /* Experimental on GPU */ -//#define __VOLUME__ -//#define __SUBSURFACE__ +#ifdef __KERNEL_CUDA_EXPERIMENTAL__ +#define __SUBSURFACE__ +#define __CMJ__ +#endif + #endif #ifdef __KERNEL_OPENCL__ @@ -101,7 +110,6 @@ CCL_NAMESPACE_BEGIN #define __BACKGROUND_MIS__ #define __LAMP_MIS__ #define __AO__ -#define __ANISOTROPIC__ //#define __CAMERA_MOTION__ //#define __OBJECT_MOTION__ //#define __HAIR__ @@ -132,11 +140,9 @@ CCL_NAMESPACE_BEGIN #ifdef __KERNEL_SHADING__ #define __SVM__ #define __EMISSION__ -#define __PROCEDURAL_TEXTURES__ -#define __IMAGE_TEXTURES__ +#define __TEXTURES__ #define __EXTRA_NODES__ #define __HOLDOUT__ -#define __NORMAL_MAP__ #endif #ifdef __KERNEL_ADV_SHADING__ @@ -146,12 +152,15 @@ CCL_NAMESPACE_BEGIN #define __BACKGROUND_MIS__ #define __LAMP_MIS__ #define __AO__ -#define __ANISOTROPIC__ #define __CAMERA_MOTION__ #define __OBJECT_MOTION__ #define __HAIR__ #endif +#ifdef WITH_CYCLES_DEBUG +# define __KERNEL_DEBUG__ +#endif + /* Random Numbers */ typedef uint RNG; @@ -221,10 +230,9 @@ enum PathTraceDimension { PRNG_PHASE_V = 9, PRNG_PHASE = 10, PRNG_SCATTER_DISTANCE = 11, - PRNG_BOUNCE_NUM = 12, -#else - PRNG_BOUNCE_NUM = 8, #endif + + PRNG_BOUNCE_NUM = 
12, }; enum SamplingPattern { @@ -250,17 +258,17 @@ enum PathRayFlag { PATH_RAY_SHADOW_TRANSPARENT = 256, PATH_RAY_SHADOW = (PATH_RAY_SHADOW_OPAQUE|PATH_RAY_SHADOW_TRANSPARENT), - PATH_RAY_CURVE = 512, /* visibility flag to define curve segments*/ + PATH_RAY_CURVE = 512, /* visibility flag to define curve segments */ + PATH_RAY_VOLUME_SCATTER = 1024, /* volume scattering */ /* note that these can use maximum 12 bits, the other are for layers */ - PATH_RAY_ALL_VISIBILITY = (1|2|4|8|16|32|64|128|256|512), + PATH_RAY_ALL_VISIBILITY = (1|2|4|8|16|32|64|128|256|512|1024), - PATH_RAY_MIS_SKIP = 1024, - PATH_RAY_DIFFUSE_ANCESTOR = 2048, - PATH_RAY_GLOSSY_ANCESTOR = 4096, - PATH_RAY_BSSRDF_ANCESTOR = 8192, - PATH_RAY_SINGLE_PASS_DONE = 16384, - PATH_RAY_VOLUME_SCATTER = 32768, + PATH_RAY_MIS_SKIP = 2048, + PATH_RAY_DIFFUSE_ANCESTOR = 4096, + PATH_RAY_GLOSSY_ANCESTOR = 8192, + PATH_RAY_BSSRDF_ANCESTOR = 16384, + PATH_RAY_SINGLE_PASS_DONE = 32768, /* we need layer member flags to be the 20 upper bits */ PATH_RAY_LAYER_SHIFT = (32-20) @@ -283,32 +291,35 @@ typedef enum ClosureLabel { typedef enum PassType { PASS_NONE = 0, - PASS_COMBINED = 1, - PASS_DEPTH = 2, - PASS_NORMAL = 4, - PASS_UV = 8, - PASS_OBJECT_ID = 16, - PASS_MATERIAL_ID = 32, - PASS_DIFFUSE_COLOR = 64, - PASS_GLOSSY_COLOR = 128, - PASS_TRANSMISSION_COLOR = 256, - PASS_DIFFUSE_INDIRECT = 512, - PASS_GLOSSY_INDIRECT = 1024, - PASS_TRANSMISSION_INDIRECT = 2048, - PASS_DIFFUSE_DIRECT = 4096, - PASS_GLOSSY_DIRECT = 8192, - PASS_TRANSMISSION_DIRECT = 16384, - PASS_EMISSION = 32768, - PASS_BACKGROUND = 65536, - PASS_AO = 131072, - PASS_SHADOW = 262144, - PASS_MOTION = 524288, - PASS_MOTION_WEIGHT = 1048576, - PASS_MIST = 2097152, - PASS_SUBSURFACE_DIRECT = 4194304, - PASS_SUBSURFACE_INDIRECT = 8388608, - PASS_SUBSURFACE_COLOR = 16777216, - PASS_LIGHT = 33554432, /* no real pass, used to force use_light_pass */ + PASS_COMBINED = (1 << 0), + PASS_DEPTH = (1 << 1), + PASS_NORMAL = (1 << 2), + PASS_UV = (1 << 3), + 
PASS_OBJECT_ID = (1 << 4), + PASS_MATERIAL_ID = (1 << 5), + PASS_DIFFUSE_COLOR = (1 << 6), + PASS_GLOSSY_COLOR = (1 << 7), + PASS_TRANSMISSION_COLOR = (1 << 8), + PASS_DIFFUSE_INDIRECT = (1 << 9), + PASS_GLOSSY_INDIRECT = (1 << 10), + PASS_TRANSMISSION_INDIRECT = (1 << 11), + PASS_DIFFUSE_DIRECT = (1 << 12), + PASS_GLOSSY_DIRECT = (1 << 13), + PASS_TRANSMISSION_DIRECT = (1 << 14), + PASS_EMISSION = (1 << 15), + PASS_BACKGROUND = (1 << 16), + PASS_AO = (1 << 17), + PASS_SHADOW = (1 << 18), + PASS_MOTION = (1 << 19), + PASS_MOTION_WEIGHT = (1 << 20), + PASS_MIST = (1 << 21), + PASS_SUBSURFACE_DIRECT = (1 << 22), + PASS_SUBSURFACE_INDIRECT = (1 << 23), + PASS_SUBSURFACE_COLOR = (1 << 24), + PASS_LIGHT = (1 << 25), /* no real pass, used to force use_light_pass */ +#ifdef __KERNEL_DEBUG__ + PASS_BVH_TRAVERSAL_STEPS = (1 << 26), +#endif } PassType; #define PASS_ALL (~0) @@ -330,21 +341,25 @@ typedef struct PathRadiance { float3 color_glossy; float3 color_transmission; float3 color_subsurface; + float3 color_scatter; float3 direct_diffuse; float3 direct_glossy; float3 direct_transmission; float3 direct_subsurface; + float3 direct_scatter; float3 indirect_diffuse; float3 indirect_glossy; float3 indirect_transmission; float3 indirect_subsurface; + float3 indirect_scatter; float3 path_diffuse; float3 path_glossy; float3 path_transmission; float3 path_subsurface; + float3 path_scatter; float4 shadow; float mist; @@ -358,6 +373,7 @@ typedef struct BsdfEval { float3 transmission; float3 transparent; float3 subsurface; + float3 scatter; } BsdfEval; #else @@ -378,7 +394,8 @@ typedef enum ShaderFlag { SHADER_EXCLUDE_GLOSSY = (1 << 26), SHADER_EXCLUDE_TRANSMIT = (1 << 25), SHADER_EXCLUDE_CAMERA = (1 << 24), - SHADER_EXCLUDE_ANY = (SHADER_EXCLUDE_DIFFUSE|SHADER_EXCLUDE_GLOSSY|SHADER_EXCLUDE_TRANSMIT|SHADER_EXCLUDE_CAMERA), + SHADER_EXCLUDE_SCATTER = (1 << 23), + SHADER_EXCLUDE_ANY = 
(SHADER_EXCLUDE_DIFFUSE|SHADER_EXCLUDE_GLOSSY|SHADER_EXCLUDE_TRANSMIT|SHADER_EXCLUDE_CAMERA|SHADER_EXCLUDE_SCATTER), SHADER_MASK = ~(SHADER_SMOOTH_NORMAL|SHADER_CAST_SHADOW|SHADER_AREA_LIGHT|SHADER_USE_MIS|SHADER_EXCLUDE_ANY) } ShaderFlag; @@ -390,10 +407,8 @@ typedef enum LightType { LIGHT_DISTANT, LIGHT_BACKGROUND, LIGHT_AREA, - LIGHT_AO, LIGHT_SPOT, - LIGHT_TRIANGLE, - LIGHT_STRAND + LIGHT_TRIANGLE } LightType; /* Camera Type */ @@ -445,6 +460,10 @@ typedef struct Intersection { int prim; int object; int type; + +#ifdef __KERNEL_DEBUG__ + int num_traversal_steps; +#endif } Intersection; /* Primitives */ @@ -478,6 +497,7 @@ typedef enum AttributeElement { ATTR_ELEMENT_VERTEX, ATTR_ELEMENT_VERTEX_MOTION, ATTR_ELEMENT_CORNER, + ATTR_ELEMENT_CORNER_BYTE, ATTR_ELEMENT_CURVE, ATTR_ELEMENT_CURVE_KEY, ATTR_ELEMENT_CURVE_KEY_MOTION, @@ -519,24 +539,32 @@ typedef enum AttributeStandard { #define MAX_CLOSURE 1 #endif +/* TODO(sergey): This is rather nasty bug happening in here, which + * could be simply a compilers bug for which we can't find a generic + * platform independent workaround. Also even if it's a compiler + * issue, it's not so simple to upgrade the compiler in the release + * environment for linux and doing it so closer to the release is + * rather a risky business. + * + * For this release it's probably safer to stick with such a rather + * dirty solution, and look for a cleaner fix during the next release + * cycle. + */ typedef struct ShaderClosure { ClosureType type; float3 weight; - +#ifndef __APPLE__ float sample_weight; - +#endif float data0; float data1; + float data2; float3 N; -#if defined(__ANISOTROPIC__) || defined(__SUBSURFACE__) || defined(__HAIR__) float3 T; +#ifdef __APPLE__ + float sample_weight; #endif - -#ifdef __HAIR__ - float offset; -#endif - #ifdef __OSL__ void *prim; #endif @@ -563,37 +591,49 @@ typedef enum ShaderContext { enum ShaderDataFlag { /* runtime flags */ - SD_BACKFACING = 1, /* backside of surface? 
*/ - SD_EMISSION = 2, /* have emissive closure? */ - SD_BSDF = 4, /* have bsdf closure? */ - SD_BSDF_HAS_EVAL = 8, /* have non-singular bsdf closure? */ - SD_PHASE_HAS_EVAL = 8, /* have non-singular phase closure? */ - SD_BSDF_GLOSSY = 16, /* have glossy bsdf */ - SD_BSSRDF = 32, /* have bssrdf */ - SD_HOLDOUT = 64, /* have holdout closure? */ - SD_ABSORPTION = 128, /* have volume absorption closure? */ - SD_SCATTER = 256, /* have volume phase closure? */ - SD_AO = 512, /* have ao closure? */ - SD_TRANSPARENT = 1024, /* have transparent closure? */ - - SD_CLOSURE_FLAGS = (SD_EMISSION|SD_BSDF|SD_BSDF_HAS_EVAL|SD_BSDF_GLOSSY|SD_BSSRDF|SD_HOLDOUT|SD_ABSORPTION|SD_SCATTER|SD_AO), + SD_BACKFACING = (1 << 0), /* backside of surface? */ + SD_EMISSION = (1 << 1), /* have emissive closure? */ + SD_BSDF = (1 << 2), /* have bsdf closure? */ + SD_BSDF_HAS_EVAL = (1 << 3), /* have non-singular bsdf closure? */ + SD_PHASE_HAS_EVAL = (1 << 3), /* have non-singular phase closure? */ + SD_BSDF_GLOSSY = (1 << 4), /* have glossy bsdf */ + SD_BSSRDF = (1 << 5), /* have bssrdf */ + SD_HOLDOUT = (1 << 6), /* have holdout closure? */ + SD_ABSORPTION = (1 << 7), /* have volume absorption closure? */ + SD_SCATTER = (1 << 8), /* have volume phase closure? */ + SD_AO = (1 << 9), /* have ao closure? */ + SD_TRANSPARENT = (1 << 10), /* have transparent closure? 
*/ + + SD_CLOSURE_FLAGS = (SD_EMISSION|SD_BSDF|SD_BSDF_HAS_EVAL|SD_BSDF_GLOSSY| + SD_BSSRDF|SD_HOLDOUT|SD_ABSORPTION|SD_SCATTER|SD_AO), /* shader flags */ - SD_USE_MIS = 2048, /* direct light sample */ - SD_HAS_TRANSPARENT_SHADOW = 4096, /* has transparent shadow */ - SD_HAS_VOLUME = 8192, /* has volume shader */ - SD_HAS_ONLY_VOLUME = 16384, /* has only volume shader, no surface */ - SD_HETEROGENEOUS_VOLUME = 32768, /* has heterogeneous volume */ - SD_HAS_BSSRDF_BUMP = 65536, /* bssrdf normal uses bump */ - - SD_SHADER_FLAGS = (SD_USE_MIS|SD_HAS_TRANSPARENT_SHADOW|SD_HAS_VOLUME|SD_HAS_ONLY_VOLUME|SD_HETEROGENEOUS_VOLUME|SD_HAS_BSSRDF_BUMP), + SD_USE_MIS = (1 << 11), /* direct light sample */ + SD_HAS_TRANSPARENT_SHADOW = (1 << 12), /* has transparent shadow */ + SD_HAS_VOLUME = (1 << 13), /* has volume shader */ + SD_HAS_ONLY_VOLUME = (1 << 14), /* has only volume shader, no surface */ + SD_HETEROGENEOUS_VOLUME = (1 << 15), /* has heterogeneous volume */ + SD_HAS_BSSRDF_BUMP = (1 << 16), /* bssrdf normal uses bump */ + SD_VOLUME_EQUIANGULAR = (1 << 17), /* use equiangular sampling */ + SD_VOLUME_MIS = (1 << 18), /* use multiple importance sampling */ + SD_VOLUME_CUBIC = (1 << 19), /* use cubic interpolation for voxels */ + + SD_SHADER_FLAGS = (SD_USE_MIS|SD_HAS_TRANSPARENT_SHADOW|SD_HAS_VOLUME| + SD_HAS_ONLY_VOLUME|SD_HETEROGENEOUS_VOLUME| + SD_HAS_BSSRDF_BUMP|SD_VOLUME_EQUIANGULAR|SD_VOLUME_MIS| + SD_VOLUME_CUBIC), /* object flags */ - SD_HOLDOUT_MASK = 131072, /* holdout for camera rays */ - SD_OBJECT_MOTION = 262144, /* has object motion blur */ - SD_TRANSFORM_APPLIED = 524288, /* vertices have transform applied */ - - SD_OBJECT_FLAGS = (SD_HOLDOUT_MASK|SD_OBJECT_MOTION|SD_TRANSFORM_APPLIED) + SD_HOLDOUT_MASK = (1 << 20), /* holdout for camera rays */ + SD_OBJECT_MOTION = (1 << 21), /* has object motion blur */ + SD_TRANSFORM_APPLIED = (1 << 22), /* vertices have transform applied */ + SD_NEGATIVE_SCALE_APPLIED = (1 << 23), /* vertices have negative scale 
applied */ + SD_OBJECT_HAS_VOLUME = (1 << 24), /* object has a volume shader */ + SD_OBJECT_INTERSECTS_VOLUME = (1 << 25), /* object intersects AABB of an object with volume shader */ + + SD_OBJECT_FLAGS = (SD_HOLDOUT_MASK|SD_OBJECT_MOTION|SD_TRANSFORM_APPLIED| + SD_NEGATIVE_SCALE_APPLIED|SD_OBJECT_HAS_VOLUME| + SD_OBJECT_INTERSECTS_VOLUME) }; struct KernelGlobals; @@ -686,9 +726,10 @@ typedef struct PathState { int flag; /* random number generator state */ - int rng_offset; /* dimension offset */ - int sample; /* path sample number */ - int num_samples; /* total number of times this path will be sampled */ + int rng_offset; /* dimension offset */ + int rng_offset_bsdf; /* dimension offset for picking bsdf */ + int sample; /* path sample number */ + int num_samples; /* total number of times this path will be sampled */ /* bounce counting */ int bounce; @@ -756,9 +797,12 @@ typedef struct KernelCamera { /* render size */ float width, height; int resolution; - int pad1; + + /* anamorphic lens bokeh */ + float inv_aperture_ratio; + + int is_inside_volume; int pad2; - int pad3; /* more matrices */ Transform screentoworld; @@ -819,6 +863,11 @@ typedef struct KernelFilm { float mist_start; float mist_inv_depth; float mist_falloff; + +#ifdef __KERNEL_DEBUG__ + int pass_bvh_traversal_steps; + int pass_pad3, pass_pad4, pass_pad5; +#endif } KernelFilm; typedef struct KernelBackground { @@ -860,7 +909,8 @@ typedef struct KernelIntegrator { int transparent_shadows; /* caustics */ - int no_caustics; + int caustics_reflective; + int caustics_refractive; float filter_glossy; /* seed */ @@ -892,7 +942,6 @@ typedef struct KernelIntegrator { int aa_samples; /* volume render */ - int volume_homogeneous_sampling; int use_volumes; int volume_max_steps; float volume_step_size; @@ -922,7 +971,6 @@ typedef enum CurveFlag { } CurveFlag; typedef struct KernelCurves { - /* strand intersect and normal parameters - many can be changed to flags */ int curveflags; int subdivisions; @@ -930,11 
+978,11 @@ typedef struct KernelCurves { float maximum_width; } KernelCurves; -typedef struct KernelBlackbody { - int table_offset; - int pad1, pad2, pad3; -} KernelBlackbody; - +typedef struct KernelTables { + int blackbody_offset; + int beckmann_offset; + int pad1, pad2; +} KernelTables; typedef struct KernelData { KernelCamera cam; @@ -943,9 +991,17 @@ typedef struct KernelData { KernelIntegrator integrator; KernelBVH bvh; KernelCurves curve; - KernelBlackbody blackbody; + KernelTables tables; } KernelData; +#ifdef __KERNEL_DEBUG__ +typedef struct DebugData { + // Total number of BVH node travesal steps and primitives intersections + // for the camera rays. + int num_bvh_traversal_steps; +} DebugData; +#endif + CCL_NAMESPACE_END #endif /* __KERNEL_TYPES_H__ */ diff --git a/intern/cycles/kernel/kernel_volume.h b/intern/cycles/kernel/kernel_volume.h index faaa68e3309..ce20f20e75a 100644 --- a/intern/cycles/kernel/kernel_volume.h +++ b/intern/cycles/kernel/kernel_volume.h @@ -116,6 +116,36 @@ ccl_device bool volume_stack_is_heterogeneous(KernelGlobals *kg, VolumeStack *st return false; } +ccl_device int volume_stack_sampling_method(KernelGlobals *kg, VolumeStack *stack) +{ + if(kernel_data.integrator.num_all_lights == 0) + return 0; + + int method = -1; + + for(int i = 0; stack[i].shader != SHADER_NONE; i++) { + int shader_flag = kernel_tex_fetch(__shader_flag, (stack[i].shader & SHADER_MASK)*2); + + if(shader_flag & SD_VOLUME_MIS) { + return SD_VOLUME_MIS; + } + else if(shader_flag & SD_VOLUME_EQUIANGULAR) { + if(method == 0) + return SD_VOLUME_MIS; + + method = SD_VOLUME_EQUIANGULAR; + } + else { + if(method == SD_VOLUME_EQUIANGULAR) + return SD_VOLUME_MIS; + + method = 0; + } + } + + return method; +} + /* Volume Shadows * * These functions are used to attenuate shadow rays to lights. 
Both absorption @@ -136,7 +166,7 @@ ccl_device void kernel_volume_shadow_homogeneous(KernelGlobals *kg, PathState *s ccl_device void kernel_volume_shadow_heterogeneous(KernelGlobals *kg, PathState *state, Ray *ray, ShaderData *sd, float3 *throughput) { float3 tp = *throughput; - const float tp_eps = 1e-10f; /* todo: this is likely not the right value */ + const float tp_eps = 1e-6f; /* todo: this is likely not the right value */ /* prepare for stepping */ int max_steps = kernel_data.integrator.volume_max_steps; @@ -146,6 +176,8 @@ ccl_device void kernel_volume_shadow_heterogeneous(KernelGlobals *kg, PathState /* compute extinction at the start */ float t = 0.0f; + float3 sum = make_float3(0.0f, 0.0f, 0.0f); + for(int i = 0; i < max_steps; i++) { /* advance to new position */ float new_t = min(ray->t, (i+1) * step); @@ -160,20 +192,26 @@ ccl_device void kernel_volume_shadow_heterogeneous(KernelGlobals *kg, PathState /* compute attenuation over segment */ if(volume_shader_extinction_sample(kg, sd, state, new_P, &sigma_t)) { - /* todo: we could avoid computing expf() for each step by summing, - * because exp(a)*exp(b) = exp(a+b), but we still want a quick - * tp_eps check too */ - tp *= volume_color_transmittance(sigma_t, new_t - t); - - /* stop if nearly all light blocked */ - if(tp.x < tp_eps && tp.y < tp_eps && tp.z < tp_eps) - break; + /* Compute expf() only for every Nth step, to save some calculations + * because exp(a)*exp(b) = exp(a+b), also do a quick tp_eps check then. */ + + sum += (-sigma_t * (new_t - t)); + if((i & 0x07) == 0) { /* ToDo: Other interval? 
*/ + tp = *throughput * make_float3(expf(sum.x), expf(sum.y), expf(sum.z)); + + /* stop if nearly all light is blocked */ + if(tp.x < tp_eps && tp.y < tp_eps && tp.z < tp_eps) + break; + } } /* stop if at the end of the volume */ t = new_t; - if(t == ray->t) + if(t == ray->t) { + /* Update throughput in case we haven't done it above */ + tp = *throughput * make_float3(expf(sum.x), expf(sum.y), expf(sum.z)); break; + } } *throughput = tp; @@ -226,33 +264,6 @@ ccl_device float kernel_volume_equiangular_pdf(Ray *ray, float3 light_P, float s return pdf; } -ccl_device bool kernel_volume_equiangular_light_position(KernelGlobals *kg, PathState *state, Ray *ray, RNG *rng, float3 *light_P) -{ - /* light RNGs */ - float light_t = path_state_rng_1D(kg, rng, state, PRNG_LIGHT); - float light_u, light_v; - path_state_rng_2D(kg, rng, state, PRNG_LIGHT_U, &light_u, &light_v); - - /* light sample */ - LightSample ls; - light_sample(kg, light_t, light_u, light_v, ray->time, ray->P, &ls); - if(ls.pdf == 0.0f) - return false; - - *light_P = ls.P; - return true; -} - -ccl_device float kernel_volume_decoupled_equiangular_pdf(KernelGlobals *kg, PathState *state, Ray *ray, RNG *rng, float sample_t) -{ - float3 light_P; - - if(!kernel_volume_equiangular_light_position(kg, state, ray, rng, &light_P)) - return 0.0f; - - return kernel_volume_equiangular_pdf(ray, light_P, sample_t); -} - /* Distance sampling */ ccl_device float kernel_volume_distance_sample(float max_t, float3 sigma_t, int channel, float xi, float3 *transmittance, float3 *pdf) @@ -312,7 +323,7 @@ ccl_device float3 kernel_volume_emission_integrate(VolumeShaderCoefficients *coe * the volume shading coefficient for the entire line segment */ ccl_device VolumeIntegrateResult kernel_volume_integrate_homogeneous(KernelGlobals *kg, PathState *state, Ray *ray, ShaderData *sd, PathRadiance *L, float3 *throughput, - RNG *rng) + RNG *rng, bool probalistic_scatter) { VolumeShaderCoefficients coeff; @@ -323,6 +334,7 @@ ccl_device 
VolumeIntegrateResult kernel_volume_integrate_homogeneous(KernelGloba float t = ray->t; float3 new_tp; +#ifdef __VOLUME_SCATTER__ /* randomly scatter, and if we do t is shortened */ if(closure_flag & SD_SCATTER) { /* extinction coefficient */ @@ -330,43 +342,41 @@ ccl_device VolumeIntegrateResult kernel_volume_integrate_homogeneous(KernelGloba /* pick random color channel, we use the Veach one-sample * model with balance heuristic for the channels */ - float rphase = path_state_rng_1D(kg, rng, state, PRNG_PHASE); + float rphase = path_state_rng_1D_for_decision(kg, rng, state, PRNG_PHASE); int channel = (int)(rphase*3.0f); sd->randb_closure = rphase*3.0f - channel; - float xi = path_state_rng_1D(kg, rng, state, PRNG_SCATTER_DISTANCE); - /* decide if we will hit or miss */ - float sample_sigma_t = kernel_volume_channel_get(sigma_t, channel); - float sample_transmittance = expf(-sample_sigma_t * t); + bool scatter = true; + float xi = path_state_rng_1D_for_decision(kg, rng, state, PRNG_SCATTER_DISTANCE); + + if(probalistic_scatter) { + float sample_sigma_t = kernel_volume_channel_get(sigma_t, channel); + float sample_transmittance = expf(-sample_sigma_t * t); + + if(1.0f - xi >= sample_transmittance) { + scatter = true; + + /* rescale random number so we can reuse it */ + xi = 1.0f - (1.0f - xi - sample_transmittance)/(1.0f - sample_transmittance); - if(xi >= sample_transmittance) { + } + else + scatter = false; + } + + if(scatter) { /* scattering */ float3 pdf; float3 transmittance; float sample_t; - /* rescale random number so we can reuse it */ - xi = (xi - sample_transmittance)/(1.0f - sample_transmittance); - - if(kernel_data.integrator.volume_homogeneous_sampling == 0 || !kernel_data.integrator.num_all_lights) { - /* distance sampling */ - sample_t = kernel_volume_distance_sample(ray->t, sigma_t, channel, xi, &transmittance, &pdf); - } - else { - /* equiangular sampling */ - float3 light_P; - float equi_pdf; - if(!kernel_volume_equiangular_light_position(kg, 
state, ray, rng, &light_P)) - return VOLUME_PATH_MISSED; - - sample_t = kernel_volume_equiangular_sample(ray, light_P, xi, &equi_pdf); - transmittance = volume_color_transmittance(sigma_t, sample_t); - pdf = make_float3(equi_pdf, equi_pdf, equi_pdf); - } + /* distance sampling */ + sample_t = kernel_volume_distance_sample(ray->t, sigma_t, channel, xi, &transmittance, &pdf); /* modifiy pdf for hit/miss decision */ - pdf *= make_float3(1.0f, 1.0f, 1.0f) - volume_color_transmittance(sigma_t, t); + if(probalistic_scatter) + pdf *= make_float3(1.0f, 1.0f, 1.0f) - volume_color_transmittance(sigma_t, t); new_tp = *throughput * coeff.sigma_s * transmittance / average(pdf); t = sample_t; @@ -378,14 +388,16 @@ ccl_device VolumeIntegrateResult kernel_volume_integrate_homogeneous(KernelGloba new_tp = *throughput * transmittance / pdf; } } - else if(closure_flag & SD_ABSORPTION) { + else +#endif + if(closure_flag & SD_ABSORPTION) { /* absorption only, no sampling needed */ float3 transmittance = volume_color_transmittance(coeff.sigma_a, t); new_tp = *throughput * transmittance; } /* integrate emission attenuated by extinction */ - if(closure_flag & SD_EMISSION) { + if(L && (closure_flag & SD_EMISSION)) { float3 sigma_t = coeff.sigma_a + coeff.sigma_s; float3 transmittance = volume_color_transmittance(sigma_t, ray->t); float3 emission = kernel_volume_emission_integrate(&coeff, closure_flag, transmittance, ray->t); @@ -408,13 +420,15 @@ ccl_device VolumeIntegrateResult kernel_volume_integrate_homogeneous(KernelGloba return VOLUME_PATH_ATTENUATED; } -/* heterogeneous volume: integrate stepping through the volume until we - * reach the end, get absorbed entirely, or run out of iterations */ -ccl_device VolumeIntegrateResult kernel_volume_integrate_heterogeneous(KernelGlobals *kg, +/* heterogeneous volume distance sampling: integrate stepping through the + * volume until we reach the end, get absorbed entirely, or run out of + * iterations. 
this does probalistically scatter or get transmitted through + * for path tracing where we don't want to branch. */ +ccl_device VolumeIntegrateResult kernel_volume_integrate_heterogeneous_distance(KernelGlobals *kg, PathState *state, Ray *ray, ShaderData *sd, PathRadiance *L, float3 *throughput, RNG *rng) { float3 tp = *throughput; - const float tp_eps = 1e-10f; /* todo: this is likely not the right value */ + const float tp_eps = 1e-6f; /* todo: this is likely not the right value */ /* prepare for stepping */ int max_steps = kernel_data.integrator.volume_max_steps; @@ -425,9 +439,12 @@ ccl_device VolumeIntegrateResult kernel_volume_integrate_heterogeneous(KernelGlo float t = 0.0f; float3 accum_transmittance = make_float3(1.0f, 1.0f, 1.0f); - /* cache some constant variables */ - float xi; - int channel = -1; + /* pick random color channel, we use the Veach one-sample + * model with balance heuristic for the channels */ + float xi = path_state_rng_1D_for_decision(kg, rng, state, PRNG_SCATTER_DISTANCE); + float rphase = path_state_rng_1D_for_decision(kg, rng, state, PRNG_PHASE); + int channel = (int)(rphase*3.0f); + sd->randb_closure = rphase*3.0f - channel; bool has_scatter = false; for(int i = 0; i < max_steps; i++) { @@ -449,25 +466,14 @@ ccl_device VolumeIntegrateResult kernel_volume_integrate_heterogeneous(KernelGlo float3 transmittance; bool scatter = false; - /* randomly scatter, and if we do dt and new_t are shortened */ + /* distance sampling */ +#ifdef __VOLUME_SCATTER__ if((closure_flag & SD_SCATTER) || (has_scatter && (closure_flag & SD_ABSORPTION))) { has_scatter = true; - /* average sigma_t and sigma_s over segment */ float3 sigma_t = coeff.sigma_a + coeff.sigma_s; float3 sigma_s = coeff.sigma_s; - /* lazily set up variables for sampling */ - if(channel == -1) { - /* pick random color channel, we use the Veach one-sample - * model with balance heuristic for the channels */ - xi = path_state_rng_1D(kg, rng, state, PRNG_SCATTER_DISTANCE); - - float 
rphase = path_state_rng_1D(kg, rng, state, PRNG_PHASE); - channel = (int)(rphase*3.0f); - sd->randb_closure = rphase*3.0f - channel; - } - /* compute transmittance over full step */ transmittance = volume_color_transmittance(sigma_t, dt); @@ -480,10 +486,12 @@ ccl_device VolumeIntegrateResult kernel_volume_integrate_heterogeneous(KernelGlo float new_dt = -logf(1.0f - xi)/sample_sigma_t; new_t = t + new_dt; - /* transmittance, throughput */ + /* transmittance and pdf */ float3 new_transmittance = volume_color_transmittance(sigma_t, new_dt); - float pdf = average(sigma_t * new_transmittance); - new_tp = tp * sigma_s * new_transmittance / pdf; + float3 pdf = sigma_t * new_transmittance; + + /* throughput */ + new_tp = tp * sigma_s * new_transmittance / average(pdf); scatter = true; } else { @@ -495,7 +503,9 @@ ccl_device VolumeIntegrateResult kernel_volume_integrate_heterogeneous(KernelGlo xi = 1.0f - (1.0f - xi)/sample_transmittance; } } - else if(closure_flag & SD_ABSORPTION) { + else +#endif + if(closure_flag & SD_ABSORPTION) { /* absorption only, no sampling needed */ float3 sigma_a = coeff.sigma_a; @@ -504,7 +514,7 @@ ccl_device VolumeIntegrateResult kernel_volume_integrate_heterogeneous(KernelGlo } /* integrate emission attenuated by absorption */ - if(closure_flag & SD_EMISSION) { + if(L && (closure_flag & SD_EMISSION)) { float3 emission = kernel_volume_emission_integrate(&coeff, closure_flag, transmittance, dt); path_radiance_accum_emission(L, tp, emission, state->bounce); } @@ -518,19 +528,19 @@ ccl_device VolumeIntegrateResult kernel_volume_integrate_heterogeneous(KernelGlo tp = make_float3(0.0f, 0.0f, 0.0f); break; } + } - /* prepare to scatter to new direction */ - if(scatter) { - /* adjust throughput and move to new location */ - sd->P = ray->P + new_t*ray->D; - *throughput = tp; + /* prepare to scatter to new direction */ + if(scatter) { + /* adjust throughput and move to new location */ + sd->P = ray->P + new_t*ray->D; + *throughput = tp; - return 
VOLUME_PATH_SCATTERED; - } - else { - /* accumulate transmittance */ - accum_transmittance *= transmittance; - } + return VOLUME_PATH_SCATTERED; + } + else { + /* accumulate transmittance */ + accum_transmittance *= transmittance; } } @@ -545,14 +555,34 @@ ccl_device VolumeIntegrateResult kernel_volume_integrate_heterogeneous(KernelGlo return VOLUME_PATH_ATTENUATED; } +/* get the volume attenuation and emission over line segment defined by + * ray, with the assumption that there are no surfaces blocking light + * between the endpoints. distance sampling is used to decide if we will + * scatter or not. */ +ccl_device_noinline VolumeIntegrateResult kernel_volume_integrate(KernelGlobals *kg, + PathState *state, ShaderData *sd, Ray *ray, PathRadiance *L, float3 *throughput, RNG *rng, bool heterogeneous) +{ + /* workaround to fix correlation bug in T38710, can find better solution + * in random number generator later, for now this is done here to not impact + * performance of rendering without volumes */ + RNG tmp_rng = cmj_hash(*rng, state->rng_offset); + + shader_setup_from_volume(kg, sd, ray, state->bounce, state->transparent_bounce); + + if(heterogeneous) + return kernel_volume_integrate_heterogeneous_distance(kg, state, ray, sd, L, throughput, &tmp_rng); + else + return kernel_volume_integrate_homogeneous(kg, state, ray, sd, L, throughput, &tmp_rng, true); +} + /* Decoupled Volume Sampling * * VolumeSegment is list of coefficients and transmittance stored at all steps * through a volume. This can then latter be used for decoupled sampling as in: - * "Importance Sampling Techniques for Path Tracing in Participating Media" */ - -/* CPU only because of malloc/free */ -#ifdef __KERNEL_CPU__ + * "Importance Sampling Techniques for Path Tracing in Participating Media" + * + * On the GPU this is only supported for homogeneous volumes (1 step), due to + * no support for malloc/free and too much stack usage with a fix size array. 
*/ typedef struct VolumeStep { float3 sigma_s; /* scatter coefficient */ @@ -571,6 +601,8 @@ typedef struct VolumeSegment { float3 accum_emission; /* accumulated emission at end of segment */ float3 accum_transmittance; /* accumulated transmittance at end of segment */ + + int sampling_method; /* volume sampling method */ } VolumeSegment; /* record volume steps to the end of the volume. @@ -578,10 +610,12 @@ typedef struct VolumeSegment { * it would be nice if we could only record up to the point that we need to scatter, * but the entire segment is needed to do always scattering, rather than probalistically * hitting or missing the volume. if we don't know the transmittance at the end of the - * volume we can't generate stratitied distance samples up to that transmittance */ + * volume we can't generate stratified distance samples up to that transmittance */ ccl_device void kernel_volume_decoupled_record(KernelGlobals *kg, PathState *state, Ray *ray, ShaderData *sd, VolumeSegment *segment, bool heterogeneous) { + const float tp_eps = 1e-6f; /* todo: this is likely not the right value */ + /* prepare for volume stepping */ int max_steps; float step_size, random_jitter_offset; @@ -608,6 +642,7 @@ ccl_device void kernel_volume_decoupled_record(KernelGlobals *kg, PathState *sta segment->closure_flag = 0; segment->numsteps = 0; + segment->steps = (VolumeStep*)malloc(sizeof(VolumeStep)*max_steps); VolumeStep *step = segment->steps; @@ -669,6 +704,10 @@ ccl_device void kernel_volume_decoupled_record(KernelGlobals *kg, PathState *sta t = new_t; if(t == ray->t) break; + + /* stop if nearly all light blocked */ + if(accum_transmittance.x < tp_eps && accum_transmittance.y < tp_eps && accum_transmittance.z < tp_eps) + break; } /* store total emission and transmittance */ @@ -698,35 +737,70 @@ ccl_device void kernel_volume_decoupled_free(KernelGlobals *kg, VolumeSegment *s * scattering, they always scatter if there is any non-zero scattering * coefficient. 
* - * these also do not do emission or modify throughput. */ + * these also do not do emission or modify throughput. + * + * function is expected to return VOLUME_PATH_SCATTERED when probalistic_scatter is false */ ccl_device VolumeIntegrateResult kernel_volume_decoupled_scatter( KernelGlobals *kg, PathState *state, Ray *ray, ShaderData *sd, - float3 *throughput, RNG *rng, VolumeSegment *segment) + float3 *throughput, float rphase, float rscatter, + const VolumeSegment *segment, const float3 *light_P, bool probalistic_scatter) { - int closure_flag = segment->closure_flag; - - if(!(closure_flag & SD_SCATTER)) - return VOLUME_PATH_MISSED; + kernel_assert(segment->closure_flag & SD_SCATTER); /* pick random color channel, we use the Veach one-sample * model with balance heuristic for the channels */ - float rphase = path_state_rng_1D(kg, rng, state, PRNG_PHASE); int channel = (int)(rphase*3.0f); sd->randb_closure = rphase*3.0f - channel; + float xi = rscatter; - float xi = path_state_rng_1D(kg, rng, state, PRNG_SCATTER_DISTANCE); + /* probalistic scattering decision based on transmittance */ + if(probalistic_scatter) { + float sample_transmittance = kernel_volume_channel_get(segment->accum_transmittance, channel); + + if(1.0f - xi >= sample_transmittance) { + /* rescale random number so we can reuse it */ + xi = 1.0f - (1.0f - xi - sample_transmittance)/(1.0f - sample_transmittance); + } + else { + *throughput /= sample_transmittance; + return VOLUME_PATH_MISSED; + } + } VolumeStep *step; float3 transmittance; float pdf, sample_t; + float mis_weight = 1.0f; + bool distance_sample = true; + bool use_mis = false; + + if(segment->sampling_method && light_P) { + if(segment->sampling_method == SD_VOLUME_MIS) { + /* multiple importance sample: randomly pick between + * equiangular and distance sampling strategy */ + if(xi < 0.5f) { + xi *= 2.0f; + } + else { + xi = (xi - 0.5f)*2.0f; + distance_sample = false; + } + + use_mis = true; + } + else { + /* only equiangular 
sampling */ + distance_sample = false; + } + } /* distance sampling */ - if(kernel_data.integrator.volume_homogeneous_sampling == 0 || !kernel_data.integrator.num_all_lights) { + if(distance_sample) { /* find step in cdf */ step = segment->steps; float prev_t = 0.0f; - float3 step_pdf = make_float3(1.0f, 1.0f, 1.0f); + float3 step_pdf_distance = make_float3(1.0f, 1.0f, 1.0f); if(segment->numsteps > 1) { float prev_cdf = 0.0f; @@ -749,7 +823,7 @@ ccl_device VolumeIntegrateResult kernel_volume_decoupled_scatter( xi = (xi - prev_cdf)/(step_cdf - prev_cdf); /* pdf for picking step */ - step_pdf = step->cdf_distance - prev_cdf_distance; + step_pdf_distance = step->cdf_distance - prev_cdf_distance; } /* determine range in which we will sample */ @@ -758,35 +832,77 @@ ccl_device VolumeIntegrateResult kernel_volume_decoupled_scatter( /* sample distance and compute transmittance */ float3 distance_pdf; sample_t = prev_t + kernel_volume_distance_sample(step_t, step->sigma_t, channel, xi, &transmittance, &distance_pdf); - pdf = average(distance_pdf * step_pdf); + + /* modifiy pdf for hit/miss decision */ + if(probalistic_scatter) + distance_pdf *= make_float3(1.0f, 1.0f, 1.0f) - segment->accum_transmittance; + + pdf = average(distance_pdf * step_pdf_distance); + + /* multiple importance sampling */ + if(use_mis) { + float equi_pdf = kernel_volume_equiangular_pdf(ray, *light_P, sample_t); + mis_weight = 2.0f*power_heuristic(pdf, equi_pdf); + } } /* equi-angular sampling */ else { - /* pick position on light */ - float3 light_P; - if(!kernel_volume_equiangular_light_position(kg, state, ray, rng, &light_P)) - return VOLUME_PATH_MISSED; - /* sample distance */ - sample_t = kernel_volume_equiangular_sample(ray, light_P, xi, &pdf); + sample_t = kernel_volume_equiangular_sample(ray, *light_P, xi, &pdf); /* find step in which sampled distance is located */ step = segment->steps; float prev_t = 0.0f; + float3 step_pdf_distance = make_float3(1.0f, 1.0f, 1.0f); if(segment->numsteps > 1) 
{ - /* todo: optimize using binary search */ - for(int i = 0; i < segment->numsteps-1; i++, step++) { - if(sample_t < step->t) + float3 prev_cdf_distance = make_float3(0.0f, 0.0f, 0.0f); + + int numsteps = segment->numsteps; + int high = numsteps - 1; + int low = 0; + int mid; + + while(low < high) { + mid = (low + high) >> 1; + + if(sample_t < step[mid].t) + high = mid; + else if(sample_t >= step[mid + 1].t) + low = mid + 1; + else { + /* found our interval in step[mid] .. step[mid+1] */ + prev_t = step[mid].t; + prev_cdf_distance = step[mid].cdf_distance; + step += mid+1; break; + } + } - prev_t = step->t; + if(low >= numsteps - 1) { + prev_t = step[numsteps - 1].t; + prev_cdf_distance = step[numsteps-1].cdf_distance; + step += numsteps - 1; } + + /* pdf for picking step with distance sampling */ + step_pdf_distance = step->cdf_distance - prev_cdf_distance; } - + + /* determine range in which we will sample */ + float step_t = step->t - prev_t; + float step_sample_t = sample_t - prev_t; + /* compute transmittance */ - transmittance = volume_color_transmittance(step->sigma_t, sample_t - prev_t); + transmittance = volume_color_transmittance(step->sigma_t, step_sample_t); + + /* multiple importance sampling */ + if(use_mis) { + float3 distance_pdf3 = kernel_volume_distance_pdf(step_t, step->sigma_t, step_sample_t); + float distance_pdf = average(distance_pdf3 * step_pdf_distance); + mis_weight = 2.0f*power_heuristic(pdf, distance_pdf); + } } /* compute transmittance up to this step */ @@ -794,7 +910,7 @@ ccl_device VolumeIntegrateResult kernel_volume_decoupled_scatter( transmittance *= (step-1)->accum_transmittance; /* modify throughput */ - *throughput *= step->sigma_s * transmittance / pdf; + *throughput *= step->sigma_s * transmittance * (mis_weight / pdf); /* evaluate shader to create closures at shading point */ if(segment->numsteps > 1) { @@ -810,40 +926,27 @@ ccl_device VolumeIntegrateResult kernel_volume_decoupled_scatter( return VOLUME_PATH_SCATTERED; } 
-#endif - -/* get the volume attenuation and emission over line segment defined by - * ray, with the assumption that there are no surfaces blocking light - * between the endpoints */ -ccl_device_noinline VolumeIntegrateResult kernel_volume_integrate(KernelGlobals *kg, - PathState *state, ShaderData *sd, Ray *ray, PathRadiance *L, float3 *throughput, RNG *rng) +/* decide if we need to use decoupled or not */ +ccl_device bool kernel_volume_use_decoupled(KernelGlobals *kg, bool heterogeneous, bool direct, int sampling_method) { - /* workaround to fix correlation bug in T38710, can find better solution - * in random number generator later, for now this is done here to not impact - * performance of rendering without volumes */ - RNG tmp_rng = cmj_hash(*rng, state->rng_offset); - bool heterogeneous = volume_stack_is_heterogeneous(kg, state->volume_stack); - -#if 0 - /* debugging code to compare decoupled ray marching */ - VolumeSegment segment; - - shader_setup_from_volume(kg, sd, ray, state->bounce, state->transparent_bounce); - kernel_volume_decoupled_record(kg, state, ray, sd, &segment, heterogeneous); - - VolumeIntegrateResult result = kernel_volume_decoupled_scatter(kg, state, ray, sd, throughput, &tmp_rng, &segment); - - kernel_volume_decoupled_free(kg, &segment); + /* decoupled ray marching for heterogenous volumes not supported on the GPU, + * which also means equiangular and multiple importance sampling is not + * support for that case */ +#ifdef __KERNEL_GPU__ + if(heterogeneous) + return false; +#endif - return result; -#else - shader_setup_from_volume(kg, sd, ray, state->bounce, state->transparent_bounce); + /* equiangular and multiple importance sampling only implemented for decoupled */ + if(sampling_method != 0) + return true; - if(heterogeneous) - return kernel_volume_integrate_heterogeneous(kg, state, ray, sd, L, throughput, &tmp_rng); + /* for all light sampling use decoupled, reusing shader evaluations is + * typically faster in that case */ + 
if(direct) + return kernel_data.integrator.sample_all_lights_direct; else - return kernel_volume_integrate_homogeneous(kg, state, ray, sd, L, throughput, &tmp_rng); -#endif + return kernel_data.integrator.sample_all_lights_indirect; } /* Volume Stack @@ -851,17 +954,88 @@ ccl_device_noinline VolumeIntegrateResult kernel_volume_integrate(KernelGlobals * This is an array of object/shared ID's that the current segment of the path * is inside of. */ -ccl_device void kernel_volume_stack_init(KernelGlobals *kg, VolumeStack *stack) +ccl_device void kernel_volume_stack_init(KernelGlobals *kg, + Ray *ray, + VolumeStack *stack) { - /* todo: this assumes camera is always in air, need to detect when it isn't */ - if(kernel_data.background.volume_shader == SHADER_NONE) { - stack[0].shader = SHADER_NONE; + /* NULL ray happens in the baker, does it need proper initialization of + * camera in volume? + */ + if(!kernel_data.cam.is_inside_volume || ray == NULL) { + /* Camera is guaranteed to be in the air, only take background volume + * into account in this case. + */ + if(kernel_data.background.volume_shader != SHADER_NONE) { + stack[0].shader = kernel_data.background.volume_shader; + stack[0].object = PRIM_NONE; + stack[1].shader = SHADER_NONE; + } + else { + stack[0].shader = SHADER_NONE; + } + return; } - else { + + Ray volume_ray = *ray; + volume_ray.t = FLT_MAX; + + int stack_index = 0, enclosed_index = 0; + int enclosed_volumes[VOLUME_STACK_SIZE]; + + while(stack_index < VOLUME_STACK_SIZE - 1 && + enclosed_index < VOLUME_STACK_SIZE - 1) + { + Intersection isect; + if(!scene_intersect_volume(kg, &volume_ray, &isect)) { + break; + } + + ShaderData sd; + shader_setup_from_ray(kg, &sd, &isect, &volume_ray, 0, 0); + if(sd.flag & SD_HAS_VOLUME) { + if(sd.flag & SD_BACKFACING) { + /* If ray exited the volume and never entered to that volume + * it means that camera is inside such a volume. 
+ */ + bool is_enclosed = false; + for(int i = 0; i < enclosed_index; ++i) { + if(enclosed_volumes[i] == sd.object) { + is_enclosed = true; + break; + } + } + if(is_enclosed == false) { + stack[stack_index].object = sd.object; + stack[stack_index].shader = sd.shader; + ++stack_index; + } + } + else { + /* If ray from camera enters the volume, this volume shouldn't + * be added to the stak on exit. + */ + enclosed_volumes[enclosed_index++] = sd.object; + } + } + + /* Move ray forward. */ + volume_ray.P = ray_offset(sd.P, -sd.Ng); + } + /* stack_index of 0 means quick checks outside of the kernel gave false + * positive, nothing to worry about, just we've wasted quite a few of + * ticks just to come into conclusion that camera is in the air. + * + * In this case we're doing the same above -- check whether background has + * volume. + */ + if(stack_index == 0 && kernel_data.background.volume_shader == SHADER_NONE) { stack[0].shader = kernel_data.background.volume_shader; stack[0].object = PRIM_NONE; stack[1].shader = SHADER_NONE; } + else { + stack[stack_index].shader = SHADER_NONE; + } } ccl_device void kernel_volume_stack_enter_exit(KernelGlobals *kg, ShaderData *sd, VolumeStack *stack) @@ -910,4 +1084,3 @@ ccl_device void kernel_volume_stack_enter_exit(KernelGlobals *kg, ShaderData *sd } CCL_NAMESPACE_END - diff --git a/intern/cycles/kernel/osl/SConscript b/intern/cycles/kernel/osl/SConscript index 4685bb7753e..d721edbaf6e 100644 --- a/intern/cycles/kernel/osl/SConscript +++ b/intern/cycles/kernel/osl/SConscript @@ -43,6 +43,9 @@ defs.append('CCL_NAMESPACE_BEGIN=namespace ccl {') defs.append('CCL_NAMESPACE_END=}') defs.append('WITH_OSL') +if env['WITH_BF_CYCLES_DEBUG']: + defs.append('WITH_CYCLES_DEBUG') + if env['OURPLATFORM'] in ('win32-vc', 'win64-vc'): cxxflags.append('-DBOOST_NO_RTTI -DBOOST_NO_TYPEID /fp:fast'.split()) incs.append(env['BF_PTHREADS_INC']) diff --git a/intern/cycles/kernel/osl/osl_bssrdf.cpp b/intern/cycles/kernel/osl/osl_bssrdf.cpp index 
94337290d20..84ef85e089d 100644 --- a/intern/cycles/kernel/osl/osl_bssrdf.cpp +++ b/intern/cycles/kernel/osl/osl_bssrdf.cpp @@ -66,18 +66,6 @@ ClosureParam *closure_bssrdf_cubic_params() static ClosureParam params[] = { CLOSURE_FLOAT3_PARAM(CubicBSSRDFClosure, sc.N), CLOSURE_FLOAT3_PARAM(CubicBSSRDFClosure, radius), - //CLOSURE_FLOAT_PARAM(CubicBSSRDFClosure, sc.data1), - CLOSURE_STRING_KEYPARAM("label"), - CLOSURE_FINISH_PARAM(CubicBSSRDFClosure) - }; - return params; -} - -ClosureParam *closure_bssrdf_cubic_extended_params() -{ - static ClosureParam params[] = { - CLOSURE_FLOAT3_PARAM(CubicBSSRDFClosure, sc.N), - CLOSURE_FLOAT3_PARAM(CubicBSSRDFClosure, radius), CLOSURE_FLOAT_PARAM(CubicBSSRDFClosure, sc.data1), CLOSURE_FLOAT_PARAM(CubicBSSRDFClosure, sc.T.x), CLOSURE_STRING_KEYPARAM("label"), @@ -107,18 +95,6 @@ ClosureParam *closure_bssrdf_gaussian_params() static ClosureParam params[] = { CLOSURE_FLOAT3_PARAM(GaussianBSSRDFClosure, sc.N), CLOSURE_FLOAT3_PARAM(GaussianBSSRDFClosure, radius), - //CLOSURE_FLOAT_PARAM(GaussianBSSRDFClosure, sc.data1), - CLOSURE_STRING_KEYPARAM("label"), - CLOSURE_FINISH_PARAM(GaussianBSSRDFClosure) - }; - return params; -} - -ClosureParam *closure_bssrdf_gaussian_extended_params() -{ - static ClosureParam params[] = { - CLOSURE_FLOAT3_PARAM(GaussianBSSRDFClosure, sc.N), - CLOSURE_FLOAT3_PARAM(GaussianBSSRDFClosure, radius), CLOSURE_FLOAT_PARAM(GaussianBSSRDFClosure, sc.data1), CLOSURE_STRING_KEYPARAM("label"), CLOSURE_FINISH_PARAM(GaussianBSSRDFClosure) diff --git a/intern/cycles/kernel/osl/osl_closures.cpp b/intern/cycles/kernel/osl/osl_closures.cpp index a96c0e2b1fb..cc9942b024e 100644 --- a/intern/cycles/kernel/osl/osl_closures.cpp +++ b/intern/cycles/kernel/osl/osl_closures.cpp @@ -41,6 +41,8 @@ #include "util_param.h" #include "kernel_types.h" +#include "kernel_compat_cpu.h" +#include "kernel_globals.h" #include "kernel_montecarlo.h" #include "closure/bsdf_util.h" @@ -51,8 +53,7 @@ #include "closure/bsdf_reflection.h" 
#include "closure/bsdf_refraction.h" #include "closure/bsdf_transparent.h" -#include "closure/bsdf_ward.h" -#include "closure/bsdf_westin.h" +#include "closure/bsdf_ashikhmin_shirley.h" #include "closure/bsdf_toon.h" #include "closure/bsdf_hair.h" #include "closure/volume.h" @@ -85,16 +86,6 @@ BSDF_CLOSURE_CLASS_BEGIN(Refraction, refraction, refraction, LABEL_SINGULAR) CLOSURE_FLOAT_PARAM(RefractionClosure, sc.data0), BSDF_CLOSURE_CLASS_END(Refraction, refraction) -BSDF_CLOSURE_CLASS_BEGIN(WestinBackscatter, westin_backscatter, westin_backscatter, LABEL_GLOSSY) - CLOSURE_FLOAT3_PARAM(WestinBackscatterClosure, sc.N), - CLOSURE_FLOAT_PARAM(WestinBackscatterClosure, sc.data0), -BSDF_CLOSURE_CLASS_END(WestinBackscatter, westin_backscatter) - -BSDF_CLOSURE_CLASS_BEGIN(WestinSheen, westin_sheen, westin_sheen, LABEL_DIFFUSE) - CLOSURE_FLOAT3_PARAM(WestinSheenClosure, sc.N), - CLOSURE_FLOAT_PARAM(WestinSheenClosure, sc.data0), -BSDF_CLOSURE_CLASS_END(WestinSheen, westin_sheen) - BSDF_CLOSURE_CLASS_BEGIN(Transparent, transparent, transparent, LABEL_SINGULAR) BSDF_CLOSURE_CLASS_END(Transparent, transparent) @@ -103,12 +94,12 @@ BSDF_CLOSURE_CLASS_BEGIN(AshikhminVelvet, ashikhmin_velvet, ashikhmin_velvet, LA CLOSURE_FLOAT_PARAM(AshikhminVelvetClosure, sc.data0), BSDF_CLOSURE_CLASS_END(AshikhminVelvet, ashikhmin_velvet) -BSDF_CLOSURE_CLASS_BEGIN(Ward, ward, ward, LABEL_GLOSSY) - CLOSURE_FLOAT3_PARAM(WardClosure, sc.N), - CLOSURE_FLOAT3_PARAM(WardClosure, sc.T), - CLOSURE_FLOAT_PARAM(WardClosure, sc.data0), - CLOSURE_FLOAT_PARAM(WardClosure, sc.data1), -BSDF_CLOSURE_CLASS_END(Ward, ward) +BSDF_CLOSURE_CLASS_BEGIN(AshikhminShirley, ashikhmin_shirley_aniso, ashikhmin_shirley, LABEL_GLOSSY|LABEL_REFLECT) + CLOSURE_FLOAT3_PARAM(AshikhminShirleyClosure, sc.N), + CLOSURE_FLOAT3_PARAM(AshikhminShirleyClosure, sc.T), + CLOSURE_FLOAT_PARAM(AshikhminShirleyClosure, sc.data0), + CLOSURE_FLOAT_PARAM(AshikhminShirleyClosure, sc.data1), +BSDF_CLOSURE_CLASS_END(AshikhminShirley, 
ashikhmin_shirley_aniso) BSDF_CLOSURE_CLASS_BEGIN(DiffuseToon, diffuse_toon, diffuse_toon, LABEL_DIFFUSE) CLOSURE_FLOAT3_PARAM(DiffuseToonClosure, sc.N), @@ -122,26 +113,40 @@ BSDF_CLOSURE_CLASS_BEGIN(GlossyToon, glossy_toon, glossy_toon, LABEL_GLOSSY) CLOSURE_FLOAT_PARAM(GlossyToonClosure, sc.data1), BSDF_CLOSURE_CLASS_END(GlossyToon, glossy_toon) -BSDF_CLOSURE_CLASS_BEGIN(MicrofacetGGX, microfacet_ggx, microfacet_ggx, LABEL_GLOSSY) +BSDF_CLOSURE_CLASS_BEGIN(MicrofacetGGX, microfacet_ggx, microfacet_ggx, LABEL_GLOSSY|LABEL_REFLECT) CLOSURE_FLOAT3_PARAM(MicrofacetGGXClosure, sc.N), CLOSURE_FLOAT_PARAM(MicrofacetGGXClosure, sc.data0), BSDF_CLOSURE_CLASS_END(MicrofacetGGX, microfacet_ggx) -BSDF_CLOSURE_CLASS_BEGIN(MicrofacetBeckmann, microfacet_beckmann, microfacet_beckmann, LABEL_GLOSSY) +BSDF_CLOSURE_CLASS_BEGIN(MicrofacetGGXAniso, microfacet_ggx_aniso, microfacet_ggx, LABEL_GLOSSY|LABEL_REFLECT) + CLOSURE_FLOAT3_PARAM(MicrofacetGGXAnisoClosure, sc.N), + CLOSURE_FLOAT3_PARAM(MicrofacetGGXAnisoClosure, sc.T), + CLOSURE_FLOAT_PARAM(MicrofacetGGXAnisoClosure, sc.data0), + CLOSURE_FLOAT_PARAM(MicrofacetGGXAnisoClosure, sc.data1), +BSDF_CLOSURE_CLASS_END(MicrofacetGGXAniso, microfacet_ggx_aniso) + +BSDF_CLOSURE_CLASS_BEGIN(MicrofacetBeckmann, microfacet_beckmann, microfacet_beckmann, LABEL_GLOSSY|LABEL_REFLECT) CLOSURE_FLOAT3_PARAM(MicrofacetBeckmannClosure, sc.N), CLOSURE_FLOAT_PARAM(MicrofacetBeckmannClosure, sc.data0), BSDF_CLOSURE_CLASS_END(MicrofacetBeckmann, microfacet_beckmann) -BSDF_CLOSURE_CLASS_BEGIN(MicrofacetGGXRefraction, microfacet_ggx_refraction, microfacet_ggx, LABEL_GLOSSY) +BSDF_CLOSURE_CLASS_BEGIN(MicrofacetBeckmannAniso, microfacet_beckmann_aniso, microfacet_beckmann, LABEL_GLOSSY|LABEL_REFLECT) + CLOSURE_FLOAT3_PARAM(MicrofacetBeckmannAnisoClosure, sc.N), + CLOSURE_FLOAT3_PARAM(MicrofacetBeckmannAnisoClosure, sc.T), + CLOSURE_FLOAT_PARAM(MicrofacetBeckmannAnisoClosure, sc.data0), + CLOSURE_FLOAT_PARAM(MicrofacetBeckmannAnisoClosure, sc.data1), 
+BSDF_CLOSURE_CLASS_END(MicrofacetBeckmannAniso, microfacet_beckmann_aniso) + +BSDF_CLOSURE_CLASS_BEGIN(MicrofacetGGXRefraction, microfacet_ggx_refraction, microfacet_ggx, LABEL_GLOSSY|LABEL_TRANSMIT) CLOSURE_FLOAT3_PARAM(MicrofacetGGXRefractionClosure, sc.N), CLOSURE_FLOAT_PARAM(MicrofacetGGXRefractionClosure, sc.data0), - CLOSURE_FLOAT_PARAM(MicrofacetGGXRefractionClosure, sc.data1), + CLOSURE_FLOAT_PARAM(MicrofacetGGXRefractionClosure, sc.data2), BSDF_CLOSURE_CLASS_END(MicrofacetGGXRefraction, microfacet_ggx_refraction) -BSDF_CLOSURE_CLASS_BEGIN(MicrofacetBeckmannRefraction, microfacet_beckmann_refraction, microfacet_beckmann, LABEL_GLOSSY) +BSDF_CLOSURE_CLASS_BEGIN(MicrofacetBeckmannRefraction, microfacet_beckmann_refraction, microfacet_beckmann, LABEL_GLOSSY|LABEL_TRANSMIT) CLOSURE_FLOAT3_PARAM(MicrofacetBeckmannRefractionClosure, sc.N), CLOSURE_FLOAT_PARAM(MicrofacetBeckmannRefractionClosure, sc.data0), - CLOSURE_FLOAT_PARAM(MicrofacetBeckmannRefractionClosure, sc.data1), + CLOSURE_FLOAT_PARAM(MicrofacetBeckmannRefractionClosure, sc.data2), BSDF_CLOSURE_CLASS_END(MicrofacetBeckmannRefraction, microfacet_beckmann_refraction) BSDF_CLOSURE_CLASS_BEGIN(HairReflection, hair_reflection, hair_reflection, LABEL_GLOSSY) @@ -150,7 +155,7 @@ BSDF_CLOSURE_CLASS_BEGIN(HairReflection, hair_reflection, hair_reflection, LABEL CLOSURE_FLOAT_PARAM(HairReflectionClosure, sc.data1), #ifdef __HAIR__ CLOSURE_FLOAT3_PARAM(HairReflectionClosure, sc.T), - CLOSURE_FLOAT_PARAM(HairReflectionClosure, sc.offset), + CLOSURE_FLOAT_PARAM(HairReflectionClosure, sc.data2), #else CLOSURE_FLOAT3_PARAM(HairReflectionClosure, sc.N), CLOSURE_FLOAT_PARAM(HairReflectionClosure, sc.data1), @@ -163,7 +168,7 @@ BSDF_CLOSURE_CLASS_BEGIN(HairTransmission, hair_transmission, hair_transmission, CLOSURE_FLOAT_PARAM(HairTransmissionClosure, sc.data1), #ifdef __HAIR__ CLOSURE_FLOAT3_PARAM(HairReflectionClosure, sc.T), - CLOSURE_FLOAT_PARAM(HairReflectionClosure, sc.offset), + 
CLOSURE_FLOAT_PARAM(HairReflectionClosure, sc.data2), #else CLOSURE_FLOAT3_PARAM(HairReflectionClosure, sc.N), CLOSURE_FLOAT_PARAM(HairReflectionClosure, sc.data1), @@ -210,26 +215,24 @@ void OSLShader::register_closures(OSLShadingSystem *ss_) bsdf_transparent_params(), bsdf_transparent_prepare); register_closure(ss, "microfacet_ggx", id++, bsdf_microfacet_ggx_params(), bsdf_microfacet_ggx_prepare); + register_closure(ss, "microfacet_ggx_aniso", id++, + bsdf_microfacet_ggx_aniso_params(), bsdf_microfacet_ggx_aniso_prepare); register_closure(ss, "microfacet_ggx_refraction", id++, bsdf_microfacet_ggx_refraction_params(), bsdf_microfacet_ggx_refraction_prepare); register_closure(ss, "microfacet_beckmann", id++, bsdf_microfacet_beckmann_params(), bsdf_microfacet_beckmann_prepare); + register_closure(ss, "microfacet_beckmann_aniso", id++, + bsdf_microfacet_beckmann_aniso_params(), bsdf_microfacet_beckmann_aniso_prepare); register_closure(ss, "microfacet_beckmann_refraction", id++, bsdf_microfacet_beckmann_refraction_params(), bsdf_microfacet_beckmann_refraction_prepare); - register_closure(ss, "ward", id++, - bsdf_ward_params(), bsdf_ward_prepare); + register_closure(ss, "ashikhmin_shirley", id++, + bsdf_ashikhmin_shirley_aniso_params(), bsdf_ashikhmin_shirley_aniso_prepare); register_closure(ss, "ashikhmin_velvet", id++, bsdf_ashikhmin_velvet_params(), bsdf_ashikhmin_velvet_prepare); register_closure(ss, "diffuse_toon", id++, bsdf_diffuse_toon_params(), bsdf_diffuse_toon_prepare); register_closure(ss, "glossy_toon", id++, bsdf_glossy_toon_params(), bsdf_glossy_toon_prepare); - register_closure(ss, "specular_toon", id++, - bsdf_glossy_toon_params(), bsdf_glossy_toon_prepare); - register_closure(ss, "westin_backscatter", id++, - bsdf_westin_backscatter_params(), bsdf_westin_backscatter_prepare); - register_closure(ss, "westin_sheen", id++, - bsdf_westin_sheen_params(), bsdf_westin_sheen_prepare); register_closure(ss, "emission", id++, closure_emission_params(), 
closure_emission_prepare); @@ -247,10 +250,6 @@ void OSLShader::register_closures(OSLShadingSystem *ss_) closure_bssrdf_cubic_params(), closure_bssrdf_cubic_prepare); register_closure(ss, "bssrdf_gaussian", id++, closure_bssrdf_gaussian_params(), closure_bssrdf_gaussian_prepare); - register_closure(ss, "bssrdf_cubic", id++, - closure_bssrdf_cubic_extended_params(), closure_bssrdf_cubic_prepare); - register_closure(ss, "bssrdf_gaussian", id++, - closure_bssrdf_gaussian_extended_params(), closure_bssrdf_gaussian_prepare); register_closure(ss, "hair_reflection", id++, bsdf_hair_reflection_params(), bsdf_hair_reflection_prepare); diff --git a/intern/cycles/kernel/osl/osl_closures.h b/intern/cycles/kernel/osl/osl_closures.h index 218cf1c19cc..5e833d738d8 100644 --- a/intern/cycles/kernel/osl/osl_closures.h +++ b/intern/cycles/kernel/osl/osl_closures.h @@ -48,12 +48,8 @@ OSL::ClosureParam *closure_holdout_params(); OSL::ClosureParam *closure_ambient_occlusion_params(); OSL::ClosureParam *closure_bsdf_diffuse_ramp_params(); OSL::ClosureParam *closure_bsdf_phong_ramp_params(); -OSL::ClosureParam *closure_westin_backscatter_params(); -OSL::ClosureParam *closure_westin_sheen_params(); OSL::ClosureParam *closure_bssrdf_cubic_params(); OSL::ClosureParam *closure_bssrdf_gaussian_params(); -OSL::ClosureParam *closure_bssrdf_cubic_extended_params(); -OSL::ClosureParam *closure_bssrdf_gaussian_extended_params(); OSL::ClosureParam *closure_henyey_greenstein_volume_params(); void closure_emission_prepare(OSL::RendererServices *, int id, void *data); @@ -62,8 +58,6 @@ void closure_holdout_prepare(OSL::RendererServices *, int id, void *data); void closure_ambient_occlusion_prepare(OSL::RendererServices *, int id, void *data); void closure_bsdf_diffuse_ramp_prepare(OSL::RendererServices *, int id, void *data); void closure_bsdf_phong_ramp_prepare(OSL::RendererServices *, int id, void *data); -void closure_westin_backscatter_prepare(OSL::RendererServices *, int id, void *data); -void 
closure_westin_sheen_prepare(OSL::RendererServices *, int id, void *data); void closure_bssrdf_cubic_prepare(OSL::RendererServices *, int id, void *data); void closure_bssrdf_gaussian_prepare(OSL::RendererServices *, int id, void *data); void closure_henyey_greenstein_volume_prepare(OSL::RendererServices *, int id, void *data); @@ -149,17 +143,18 @@ public: \ \ void blur(float roughness) \ { \ - bsdf_##svmlower##_blur(&sc, roughness); \ } \ \ float3 eval_reflect(const float3 &omega_out, const float3 &omega_in, float& pdf) const \ { \ - return bsdf_##svmlower##_eval_reflect(&sc, omega_out, omega_in, &pdf); \ + pdf = 0; \ + return make_float3(0, 0, 0); \ } \ \ float3 eval_transmit(const float3 &omega_out, const float3 &omega_in, float& pdf) const \ { \ - return bsdf_##svmlower##_eval_transmit(&sc, omega_out, omega_in, &pdf); \ + pdf = 0; \ + return make_float3(0, 0, 0); \ } \ \ int sample(const float3 &Ng, \ @@ -168,8 +163,8 @@ public: \ float3 &omega_in, float3 &domega_in_dx, float3 &domega_in_dy, \ float &pdf, float3 &eval) const \ { \ - return bsdf_##svmlower##_sample(&sc, Ng, omega_out, domega_out_dx, domega_out_dy, \ - randu, randv, &eval, &omega_in, &domega_in_dx, &domega_in_dy, &pdf); \ + pdf = 0; \ + return LABEL_NONE; \ } \ }; \ \ diff --git a/intern/cycles/kernel/osl/osl_globals.h b/intern/cycles/kernel/osl/osl_globals.h index 5a658d8244a..9c3134e41c9 100644 --- a/intern/cycles/kernel/osl/osl_globals.h +++ b/intern/cycles/kernel/osl/osl_globals.h @@ -20,7 +20,6 @@ #ifdef WITH_OSL #include <OSL/oslexec.h> -#include <cmath> #include "util_map.h" #include "util_param.h" diff --git a/intern/cycles/kernel/osl/osl_services.cpp b/intern/cycles/kernel/osl/osl_services.cpp index 54894ea19eb..a9694651e14 100644 --- a/intern/cycles/kernel/osl/osl_services.cpp +++ b/intern/cycles/kernel/osl/osl_services.cpp @@ -126,7 +126,7 @@ void OSLRenderServices::thread_init(KernelGlobals *kernel_globals_, OSL::Texture osl_ts = osl_ts_; } -bool 
OSLRenderServices::get_matrix(OSL::Matrix44 &result, OSL::TransformationPtr xform, float time) +bool OSLRenderServices::get_matrix(OSL::ShaderGlobals *sg, OSL::Matrix44 &result, OSL::TransformationPtr xform, float time) { /* this is only used for shader and object space, we don't really have * a concept of shader space, so we just use object space for both. */ @@ -156,7 +156,7 @@ bool OSLRenderServices::get_matrix(OSL::Matrix44 &result, OSL::TransformationPtr return false; } -bool OSLRenderServices::get_inverse_matrix(OSL::Matrix44 &result, OSL::TransformationPtr xform, float time) +bool OSLRenderServices::get_inverse_matrix(OSL::ShaderGlobals *sg, OSL::Matrix44 &result, OSL::TransformationPtr xform, float time) { /* this is only used for shader and object space, we don't really have * a concept of shader space, so we just use object space for both. */ @@ -186,7 +186,7 @@ bool OSLRenderServices::get_inverse_matrix(OSL::Matrix44 &result, OSL::Transform return false; } -bool OSLRenderServices::get_matrix(OSL::Matrix44 &result, ustring from, float time) +bool OSLRenderServices::get_matrix(OSL::ShaderGlobals *sg, OSL::Matrix44 &result, ustring from, float time) { KernelGlobals *kg = kernel_globals; @@ -218,7 +218,7 @@ bool OSLRenderServices::get_matrix(OSL::Matrix44 &result, ustring from, float ti return false; } -bool OSLRenderServices::get_inverse_matrix(OSL::Matrix44 &result, ustring to, float time) +bool OSLRenderServices::get_inverse_matrix(OSL::ShaderGlobals *sg, OSL::Matrix44 &result, ustring to, float time) { KernelGlobals *kg = kernel_globals; @@ -250,7 +250,7 @@ bool OSLRenderServices::get_inverse_matrix(OSL::Matrix44 &result, ustring to, fl return false; } -bool OSLRenderServices::get_matrix(OSL::Matrix44 &result, OSL::TransformationPtr xform) +bool OSLRenderServices::get_matrix(OSL::ShaderGlobals *sg, OSL::Matrix44 &result, OSL::TransformationPtr xform) { /* this is only used for shader and object space, we don't really have * a concept of shader space, so 
we just use object space for both. */ @@ -275,7 +275,7 @@ bool OSLRenderServices::get_matrix(OSL::Matrix44 &result, OSL::TransformationPtr return false; } -bool OSLRenderServices::get_inverse_matrix(OSL::Matrix44 &result, OSL::TransformationPtr xform) +bool OSLRenderServices::get_inverse_matrix(OSL::ShaderGlobals *sg, OSL::Matrix44 &result, OSL::TransformationPtr xform) { /* this is only used for shader and object space, we don't really have * a concept of shader space, so we just use object space for both. */ @@ -300,7 +300,7 @@ bool OSLRenderServices::get_inverse_matrix(OSL::Matrix44 &result, OSL::Transform return false; } -bool OSLRenderServices::get_matrix(OSL::Matrix44 &result, ustring from) +bool OSLRenderServices::get_matrix(OSL::ShaderGlobals *sg, OSL::Matrix44 &result, ustring from) { KernelGlobals *kg = kernel_globals; @@ -328,7 +328,7 @@ bool OSLRenderServices::get_matrix(OSL::Matrix44 &result, ustring from) return false; } -bool OSLRenderServices::get_inverse_matrix(OSL::Matrix44 &result, ustring to) +bool OSLRenderServices::get_inverse_matrix(OSL::ShaderGlobals *sg, OSL::Matrix44 &result, ustring to) { KernelGlobals *kg = kernel_globals; @@ -356,7 +356,7 @@ bool OSLRenderServices::get_inverse_matrix(OSL::Matrix44 &result, ustring to) return false; } -bool OSLRenderServices::get_array_attribute(void *renderstate, bool derivatives, +bool OSLRenderServices::get_array_attribute(OSL::ShaderGlobals *sg, bool derivatives, ustring object, TypeDesc type, ustring name, int index, void *val) { @@ -479,7 +479,7 @@ static bool set_attribute_int(int i, TypeDesc type, bool derivatives, void *val) static bool set_attribute_string(ustring str, TypeDesc type, bool derivatives, void *val) { - if(type.basetype == TypeDesc::INT && type.aggregate == TypeDesc::SCALAR && type.arraylen == 0) { + if(type.basetype == TypeDesc::STRING && type.aggregate == TypeDesc::SCALAR && type.arraylen == 0) { ustring *sval = (ustring *)val; sval[0] = str; @@ -718,7 +718,7 @@ bool 
OSLRenderServices::get_background_attribute(KernelGlobals *kg, ShaderData * return set_attribute_int(f, type, derivatives, val); } else if (name == u_path_transparent_depth) { - /* Ray Depth */ + /* Transparent Ray Depth */ int f = sd->transparent_depth; return set_attribute_int(f, type, derivatives, val); } @@ -751,14 +751,22 @@ bool OSLRenderServices::get_background_attribute(KernelGlobals *kg, ShaderData * return false; } -bool OSLRenderServices::get_attribute(void *renderstate, bool derivatives, ustring object_name, +bool OSLRenderServices::get_attribute(OSL::ShaderGlobals *sg, bool derivatives, ustring object_name, + TypeDesc type, ustring name, void *val) +{ + if (sg->renderstate == NULL) + return false; + + ShaderData *sd = (ShaderData *)(sg->renderstate); + return get_attribute(sd, derivatives, object_name, type, name, val); +} + +bool OSLRenderServices::get_attribute(ShaderData *sd, bool derivatives, ustring object_name, TypeDesc type, ustring name, void *val) { - ShaderData *sd = (ShaderData *)renderstate; KernelGlobals *kg = sd->osl_globals; bool is_curve; int object; - // int prim; /* lookup of attribute on another object */ if (object_name != u_empty) { @@ -768,12 +776,10 @@ bool OSLRenderServices::get_attribute(void *renderstate, bool derivatives, ustri return false; object = it->second; - // prim = PRIM_NONE; is_curve = false; } else { object = sd->object; - // prim = sd->prim; is_curve = (sd->type & PRIMITIVE_ALL_CURVE) != 0; if (object == OBJECT_NONE) @@ -815,12 +821,12 @@ bool OSLRenderServices::get_attribute(void *renderstate, bool derivatives, ustri } bool OSLRenderServices::get_userdata(bool derivatives, ustring name, TypeDesc type, - void *renderstate, void *val) + OSL::ShaderGlobals *sg, void *val) { return false; /* disabled by lockgeom */ } -bool OSLRenderServices::has_userdata(ustring name, TypeDesc type, void *renderstate) +bool OSLRenderServices::has_userdata(ustring name, TypeDesc type, OSL::ShaderGlobals *sg) { return false; /* never 
called by OSL */ } @@ -871,14 +877,30 @@ bool OSLRenderServices::texture(ustring filename, TextureOpt &options, return true; } #endif + bool status; - OSLThreadData *tdata = kg->osl_tdata; - OIIO::TextureSystem::Perthread *thread_info = tdata->oiio_thread_info; + if(filename[0] == '@' && filename.find('.') == -1) { + int slot = atoi(filename.c_str() + 1); + float4 rgba = kernel_tex_image_interp(slot, s, 1.0f - t); - OIIO::TextureSystem::TextureHandle *th = ts->get_texture_handle(filename, thread_info); + result[0] = rgba[0]; + if(options.nchannels > 1) + result[1] = rgba[1]; + if(options.nchannels > 2) + result[2] = rgba[2]; + if(options.nchannels > 3) + result[3] = rgba[3]; + status = true; + } + else { + OSLThreadData *tdata = kg->osl_tdata; + OIIO::TextureSystem::Perthread *thread_info = tdata->oiio_thread_info; - bool status = ts->texture(th, thread_info, - options, s, t, dsdx, dtdx, dsdy, dtdy, result); + OIIO::TextureSystem::TextureHandle *th = ts->get_texture_handle(filename, thread_info); + + status = ts->texture(th, thread_info, + options, s, t, dsdx, dtdx, dsdy, dtdy, result); + } if(!status) { if(options.nchannels == 3 || options.nchannels == 4) { @@ -953,7 +975,7 @@ bool OSLRenderServices::environment(ustring filename, TextureOpt &options, return status; } -bool OSLRenderServices::get_texture_info(ustring filename, int subimage, +bool OSLRenderServices::get_texture_info(OSL::ShaderGlobals *sg, ustring filename, int subimage, ustring dataname, TypeDesc datatype, void *data) { @@ -996,7 +1018,7 @@ bool OSLRenderServices::trace(TraceOpt &options, OSL::ShaderGlobals *sg, ray.P = TO_FLOAT3(P); ray.D = TO_FLOAT3(R); - ray.t = (options.maxdist == 1.0e30)? FLT_MAX: options.maxdist - options.mindist; + ray.t = (options.maxdist == 1.0e30f)? 
FLT_MAX: options.maxdist - options.mindist; ray.time = sd->time; if(options.mindist == 0.0f) { @@ -1025,11 +1047,7 @@ bool OSLRenderServices::trace(TraceOpt &options, OSL::ShaderGlobals *sg, tracedata->sd.osl_globals = sd->osl_globals; /* raytrace */ -#ifdef __HAIR__ return scene_intersect(sd->osl_globals, &ray, PATH_RAY_ALL_VISIBILITY, &tracedata->isect, NULL, 0.0f, 0.0f); -#else - return scene_intersect(sd->osl_globals, &ray, PATH_RAY_ALL_VISIBILITY, &tracedata->isect); -#endif } diff --git a/intern/cycles/kernel/osl/osl_services.h b/intern/cycles/kernel/osl/osl_services.h index 069722d81b6..6f928a0d103 100644 --- a/intern/cycles/kernel/osl/osl_services.h +++ b/intern/cycles/kernel/osl/osl_services.h @@ -49,27 +49,29 @@ public: void thread_init(KernelGlobals *kernel_globals, OSL::TextureSystem *ts); - bool get_matrix(OSL::Matrix44 &result, OSL::TransformationPtr xform, float time); - bool get_inverse_matrix(OSL::Matrix44 &result, OSL::TransformationPtr xform, float time); + bool get_matrix(OSL::ShaderGlobals *sg, OSL::Matrix44 &result, OSL::TransformationPtr xform, float time); + bool get_inverse_matrix(OSL::ShaderGlobals *sg, OSL::Matrix44 &result, OSL::TransformationPtr xform, float time); - bool get_matrix(OSL::Matrix44 &result, ustring from, float time); - bool get_inverse_matrix(OSL::Matrix44 &result, ustring to, float time); + bool get_matrix(OSL::ShaderGlobals *sg, OSL::Matrix44 &result, ustring from, float time); + bool get_inverse_matrix(OSL::ShaderGlobals *sg, OSL::Matrix44 &result, ustring to, float time); - bool get_matrix(OSL::Matrix44 &result, OSL::TransformationPtr xform); - bool get_inverse_matrix(OSL::Matrix44 &result, OSL::TransformationPtr xform); + bool get_matrix(OSL::ShaderGlobals *sg, OSL::Matrix44 &result, OSL::TransformationPtr xform); + bool get_inverse_matrix(OSL::ShaderGlobals *sg, OSL::Matrix44 &result, OSL::TransformationPtr xform); - bool get_matrix(OSL::Matrix44 &result, ustring from); - bool get_inverse_matrix(OSL::Matrix44 
&result, ustring from); + bool get_matrix(OSL::ShaderGlobals *sg, OSL::Matrix44 &result, ustring from); + bool get_inverse_matrix(OSL::ShaderGlobals *sg, OSL::Matrix44 &result, ustring from); - bool get_array_attribute(void *renderstate, bool derivatives, + bool get_array_attribute(OSL::ShaderGlobals *sg, bool derivatives, ustring object, TypeDesc type, ustring name, int index, void *val); - bool get_attribute(void *renderstate, bool derivatives, ustring object, + bool get_attribute(OSL::ShaderGlobals *sg, bool derivatives, ustring object, + TypeDesc type, ustring name, void *val); + bool get_attribute(ShaderData *sd, bool derivatives, ustring object_name, TypeDesc type, ustring name, void *val); bool get_userdata(bool derivatives, ustring name, TypeDesc type, - void *renderstate, void *val); - bool has_userdata(ustring name, TypeDesc type, void *renderstate); + OSL::ShaderGlobals *sg, void *val); + bool has_userdata(ustring name, TypeDesc type, OSL::ShaderGlobals *sg); int pointcloud_search(OSL::ShaderGlobals *sg, ustring filename, const OSL::Vec3 &center, float radius, int max_points, bool sort, size_t *out_indices, @@ -106,7 +108,7 @@ public: OSL::ShaderGlobals *sg, const OSL::Vec3 &R, const OSL::Vec3 &dRdx, const OSL::Vec3 &dRdy, float *result); - bool get_texture_info(ustring filename, int subimage, + bool get_texture_info(OSL::ShaderGlobals *sg, ustring filename, int subimage, ustring dataname, TypeDesc datatype, void *data); static bool get_background_attribute(KernelGlobals *kg, ShaderData *sd, ustring name, @@ -157,6 +159,70 @@ public: static ustring u_v; static ustring u_empty; +#if OSL_LIBRARY_VERSION_CODE < 10500 + bool get_matrix(OSL::Matrix44 &result, OSL::TransformationPtr xform, float time) { + return get_matrix(NULL, result, xform, time); + } + + bool get_inverse_matrix(OSL::Matrix44 &result, OSL::TransformationPtr xform, float time) { + return get_inverse_matrix(NULL, result, xform, time); + } + + bool get_matrix(OSL::Matrix44 &result, ustring from, 
float time) { + return get_matrix(NULL, result, from, time); + } + + bool get_inverse_matrix(OSL::Matrix44 &result, ustring to, float time) { + return get_inverse_matrix(NULL, result, to, time); + } + + bool get_matrix(OSL::Matrix44 &result, OSL::TransformationPtr xform) { + return get_matrix(NULL, result, xform); + } + + bool get_inverse_matrix(OSL::Matrix44 &result, OSL::TransformationPtr xform) { + return get_inverse_matrix(NULL, result, xform); + } + + bool get_matrix(OSL::Matrix44 &result, ustring from) { + return get_matrix(NULL, result, from); + } + + bool get_inverse_matrix(OSL::Matrix44 &result, ustring to) { + return get_inverse_matrix(NULL, result, to); + } + + bool get_array_attribute(void *renderstate, bool derivatives, + ustring object, TypeDesc type, ustring name, + int index, void *val) { + OSL::ShaderGlobals sg; + sg.renderstate = renderstate; + return get_array_attribute(&sg, derivatives, + object, type, name, + index, val); + } + + bool get_attribute(void *renderstate, bool derivatives, ustring object_name, + TypeDesc type, ustring name, void *val) { + OSL::ShaderGlobals sg; + sg.renderstate = renderstate; + return get_attribute(&sg, derivatives, object_name, type, name, val); + } + + bool has_userdata(ustring name, TypeDesc type, void *renderstate) { + return has_userdata(name, type, (OSL::ShaderGlobals *) renderstate); + } + + bool get_userdata(bool derivatives, ustring name, TypeDesc type, + void *renderstate, void *val) { + return get_userdata(derivatives, name, type, (OSL::ShaderGlobals *) renderstate, val); + } + + bool get_texture_info(ustring filename, int subimage, + ustring dataname, TypeDesc datatype, void *data) { + return get_texture_info(NULL, filename, subimage, dataname, datatype, data); + } +#endif private: KernelGlobals *kernel_globals; OSL::TextureSystem *osl_ts; diff --git a/intern/cycles/kernel/osl/osl_shader.cpp b/intern/cycles/kernel/osl/osl_shader.cpp index 843dcdd0985..ca0c2cc4415 100644 --- 
a/intern/cycles/kernel/osl/osl_shader.cpp +++ b/intern/cycles/kernel/osl/osl_shader.cpp @@ -14,6 +14,8 @@ * limitations under the License */ +#include <OSL/oslexec.h> + #include "kernel_compat_cpu.h" #include "kernel_montecarlo.h" #include "kernel_types.h" @@ -34,7 +36,6 @@ #include "attribute.h" -#include <OSL/oslexec.h> CCL_NAMESPACE_BEGIN @@ -164,11 +165,14 @@ static void flatten_surface_closure_tree(ShaderData *sd, int path_flag, CBSDFClosure *bsdf = (CBSDFClosure *)prim; int scattering = bsdf->scattering(); - /* no caustics option */ - if(scattering == LABEL_GLOSSY && (path_flag & PATH_RAY_DIFFUSE)) { + /* caustic options */ + if((scattering & LABEL_GLOSSY) && (path_flag & PATH_RAY_DIFFUSE)) { KernelGlobals *kg = sd->osl_globals; - if(kernel_data.integrator.no_caustics) + + if((!kernel_data.integrator.caustics_reflective && (scattering & LABEL_REFLECT)) || + (!kernel_data.integrator.caustics_refractive && (scattering & LABEL_TRANSMIT))) { return; + } } /* sample weight */ @@ -181,12 +185,9 @@ static void flatten_surface_closure_tree(ShaderData *sd, int path_flag, sc.T = bsdf->sc.T; sc.data0 = bsdf->sc.data0; sc.data1 = bsdf->sc.data1; + sc.data2 = bsdf->sc.data2; sc.prim = bsdf->sc.prim; -#ifdef __HAIR__ - sc.offset = bsdf->sc.offset; -#endif - /* add */ if(sc.sample_weight > CLOSURE_WEIGHT_CUTOFF && sd->num_closure < MAX_CLOSURE) { sd->closure[sd->num_closure++] = sc; @@ -202,6 +203,7 @@ static void flatten_surface_closure_tree(ShaderData *sd, int path_flag, sc.type = CLOSURE_EMISSION_ID; sc.data0 = 0.0f; sc.data1 = 0.0f; + sc.data2 = 0.0f; sc.prim = NULL; /* flag */ @@ -219,6 +221,7 @@ static void flatten_surface_closure_tree(ShaderData *sd, int path_flag, sc.type = CLOSURE_AMBIENT_OCCLUSION_ID; sc.data0 = 0.0f; sc.data1 = 0.0f; + sc.data2 = 0.0f; sc.prim = NULL; if(sd->num_closure < MAX_CLOSURE) { @@ -232,6 +235,7 @@ static void flatten_surface_closure_tree(ShaderData *sd, int path_flag, sc.type = CLOSURE_HOLDOUT_ID; sc.data0 = 0.0f; sc.data1 = 0.0f; + 
sc.data2 = 0.0f; sc.prim = NULL; if(sd->num_closure < MAX_CLOSURE) { diff --git a/intern/cycles/kernel/shaders/CMakeLists.txt b/intern/cycles/kernel/shaders/CMakeLists.txt index 5518d652bf9..0b735ede701 100644 --- a/intern/cycles/kernel/shaders/CMakeLists.txt +++ b/intern/cycles/kernel/shaders/CMakeLists.txt @@ -4,6 +4,7 @@ set(SRC_OSL node_add_closure.osl node_ambient_occlusion.osl + node_anisotropic_bsdf.osl node_attribute.osl node_background.osl node_brick_texture.osl @@ -13,6 +14,7 @@ set(SRC_OSL node_checker_texture.osl node_combine_rgb.osl node_combine_hsv.osl + node_combine_xyz.osl node_convert_from_color.osl node_convert_from_float.osl node_convert_from_int.osl @@ -57,6 +59,7 @@ set(SRC_OSL node_rgb_ramp.osl node_separate_rgb.osl node_separate_hsv.osl + node_separate_xyz.osl node_set_normal.osl node_sky_texture.osl node_subsurface_scattering.osl @@ -71,7 +74,6 @@ set(SRC_OSL node_vector_transform.osl node_velvet_bsdf.osl node_voronoi_texture.osl - node_ward_bsdf.osl node_wavelength.osl node_blackbody.osl node_wave_texture.osl diff --git a/intern/cycles/kernel/shaders/node_ward_bsdf.osl b/intern/cycles/kernel/shaders/node_anisotropic_bsdf.osl index 2d360d594f2..da1e4f77107 100644 --- a/intern/cycles/kernel/shaders/node_ward_bsdf.osl +++ b/intern/cycles/kernel/shaders/node_anisotropic_bsdf.osl @@ -16,8 +16,9 @@ #include "stdosl.h" -shader node_ward_bsdf( +shader node_anisotropic_bsdf( color Color = 0.0, + string distribution = "GGX", float Roughness = 0.0, float Anisotropy = 0.0, float Rotation = 0.0, @@ -44,6 +45,13 @@ shader node_ward_bsdf( RoughnessV = Roughness / (1.0 - aniso); } - BSDF = Color * ward(Normal, T, RoughnessU, RoughnessV); + if (distribution == "Sharp") + BSDF = Color * reflection(Normal); + else if (distribution == "Beckmann") + BSDF = Color * microfacet_beckmann_aniso(Normal, T, RoughnessU, RoughnessV); + else if (distribution == "GGX") + BSDF = Color * microfacet_ggx_aniso(Normal, T, RoughnessU, RoughnessV); + else + BSDF = Color * 
ashikhmin_shirley(Normal, T, RoughnessU, RoughnessV); } diff --git a/intern/cycles/kernel/shaders/node_brick_texture.osl b/intern/cycles/kernel/shaders/node_brick_texture.osl index 70a6a6ea7ce..c9fb3542aef 100644 --- a/intern/cycles/kernel/shaders/node_brick_texture.osl +++ b/intern/cycles/kernel/shaders/node_brick_texture.osl @@ -93,6 +93,6 @@ shader node_brick_texture( Col[2] = facm * (Color1[2]) + tint * Color2[2]; } - Color = (Fac == 1.0) ? Mortar: Col; + Color = (Fac == 1.0) ? Mortar : Col; } diff --git a/intern/cycles/kernel/shaders/node_checker_texture.osl b/intern/cycles/kernel/shaders/node_checker_texture.osl index 6723076723c..a6d21fd36f3 100644 --- a/intern/cycles/kernel/shaders/node_checker_texture.osl +++ b/intern/cycles/kernel/shaders/node_checker_texture.osl @@ -21,9 +21,9 @@ float checker(point p) { - p[0] = (p[0] + 0.00001) * 0.9999; - p[1] = (p[1] + 0.00001) * 0.9999; - p[2] = (p[2] + 0.00001) * 0.9999; + p[0] = (p[0] + 0.000001) * 0.999999; + p[1] = (p[1] + 0.000001) * 0.999999; + p[2] = (p[2] + 0.000001) * 0.999999; int xi = (int)fabs(floor(p[0])); int yi = (int)fabs(floor(p[1])); diff --git a/intern/cycles/kernel/shaders/node_combine_xyz.osl b/intern/cycles/kernel/shaders/node_combine_xyz.osl new file mode 100644 index 00000000000..933dee5bd78 --- /dev/null +++ b/intern/cycles/kernel/shaders/node_combine_xyz.osl @@ -0,0 +1,27 @@ +/* + * Copyright 2011-2014 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License + */ + +#include "stdosl.h" + +shader node_combine_xyz( + float X = 0.0, + float Y = 0.0, + float Z = 0.0, + output vector Vector = 0.8) +{ + Vector = vector(X, Y, Z); +} + diff --git a/intern/cycles/kernel/shaders/node_emission.osl b/intern/cycles/kernel/shaders/node_emission.osl index 2428da5ef4e..b28d731c19f 100644 --- a/intern/cycles/kernel/shaders/node_emission.osl +++ b/intern/cycles/kernel/shaders/node_emission.osl @@ -17,14 +17,10 @@ #include "stdosl.h" shader node_emission( - int TotalPower = 0, color Color = 0.8, float Strength = 1.0, output closure color Emission = 0) { - if (TotalPower) - Emission = ((Strength / surfacearea()) * Color) * emission(); - else - Emission = (Strength * Color) * emission(); + Emission = (Strength * Color) * emission(); } diff --git a/intern/cycles/kernel/shaders/node_fresnel.h b/intern/cycles/kernel/shaders/node_fresnel.h index 447a84255ef..d192c5d02de 100644 --- a/intern/cycles/kernel/shaders/node_fresnel.h +++ b/intern/cycles/kernel/shaders/node_fresnel.h @@ -34,3 +34,16 @@ float fresnel_dielectric_cos(float cosi, float eta) return result; } +color fresnel_conductor(float cosi, color eta, color k) +{ + color cosi2 = color(cosi * cosi); + color one = color(1, 1, 1); + color tmp_f = eta * eta + k * k; + color tmp = tmp_f * cosi2; + color Rparl2 = (tmp - (2.0 * eta * cosi) + one) / + (tmp + (2.0 * eta * cosi) + one); + color Rperp2 = (tmp_f - (2.0 * eta * cosi) + cosi2) / + (tmp_f + (2.0 * eta * cosi) + cosi2); + return (Rparl2 + Rperp2) * 0.5; +} + diff --git a/intern/cycles/kernel/shaders/node_geometry.osl b/intern/cycles/kernel/shaders/node_geometry.osl index dbdf55802ae..cd68f07b21e 100644 --- a/intern/cycles/kernel/shaders/node_geometry.osl +++ b/intern/cycles/kernel/shaders/node_geometry.osl @@ -49,12 +49,8 @@ shader node_geometry( /* try to create spherical tangent from generated coordinates */ if 
(getattribute("geom:generated", generated)) { - matrix project = matrix(0.0, 1.0, 0.0, 0.0, - -1.0, 0.0, 0.0, 0.0, - 0.0, 0.0, 0.0, 0.0, - 0.5, -0.5, 0.0, 1.0); - - vector T = transform("object", "world", transform(project, generated)); + normal data = normal(-(generated[1] - 0.5), (generated[0] - 0.5), 0.0); + vector T = transform("object", "world", data); Tangent = cross(Normal, normalize(cross(T, Normal))); } else { diff --git a/intern/cycles/kernel/shaders/node_glossy_bsdf.osl b/intern/cycles/kernel/shaders/node_glossy_bsdf.osl index b4e0fe62223..5c727ca6917 100644 --- a/intern/cycles/kernel/shaders/node_glossy_bsdf.osl +++ b/intern/cycles/kernel/shaders/node_glossy_bsdf.osl @@ -19,7 +19,7 @@ shader node_glossy_bsdf( color Color = 0.8, - string distribution = "Beckmann", + string distribution = "GGX", float Roughness = 0.2, normal Normal = N, output closure color BSDF = 0) @@ -30,6 +30,8 @@ shader node_glossy_bsdf( BSDF = Color * microfacet_beckmann(Normal, Roughness); else if (distribution == "GGX") BSDF = Color * microfacet_ggx(Normal, Roughness); + else + BSDF = Color * ashikhmin_shirley(Normal, vector(0, 0, 0), Roughness, Roughness); } diff --git a/intern/cycles/kernel/shaders/node_image_texture.osl b/intern/cycles/kernel/shaders/node_image_texture.osl index 7238a1e8862..18b5fb4b31f 100644 --- a/intern/cycles/kernel/shaders/node_image_texture.osl +++ b/intern/cycles/kernel/shaders/node_image_texture.osl @@ -113,6 +113,10 @@ shader node_image_texture( weight[2] = ((2.0 - limit) * Nob[2] + (limit - 1.0)) / (2.0 * limit - 1.0); } } + else { + /* Desperate mode, no valid choice anyway, fallback to one side.*/ + weight[0] = 1.0; + } Color = color(0.0, 0.0, 0.0); Alpha = 0.0; diff --git a/intern/cycles/kernel/shaders/node_musgrave_texture.osl b/intern/cycles/kernel/shaders/node_musgrave_texture.osl index 60762539002..a32c3d4b1b8 100644 --- a/intern/cycles/kernel/shaders/node_musgrave_texture.osl +++ b/intern/cycles/kernel/shaders/node_musgrave_texture.osl @@ 
-35,14 +35,14 @@ float noise_musgrave_fBm(point p, string basis, float H, float lacunarity, float int i; for (i = 0; i < (int)octaves; i++) { - value += safe_noise(p, 0) * pwr; + value += safe_noise(p, "signed") * pwr; pwr *= pwHL; p *= lacunarity; } rmd = octaves - floor(octaves); if (rmd != 0.0) - value += rmd * safe_noise(p, 0) * pwr; + value += rmd * safe_noise(p, "signed") * pwr; return value; } @@ -63,14 +63,14 @@ float noise_musgrave_multi_fractal(point p, string basis, float H, float lacunar int i; for (i = 0; i < (int)octaves; i++) { - value *= (pwr * safe_noise(p, 0) + 1.0); + value *= (pwr * safe_noise(p, "signed") + 1.0); pwr *= pwHL; p *= lacunarity; } rmd = octaves - floor(octaves); if (rmd != 0.0) - value *= (rmd * pwr * safe_noise(p, 0) + 1.0); /* correct? */ + value *= (rmd * pwr * safe_noise(p, "signed") + 1.0); /* correct? */ return value; } @@ -91,11 +91,11 @@ float noise_musgrave_hetero_terrain(point p, string basis, float H, float lacuna int i; /* first unscaled octave of function; later octaves are scaled */ - value = offset + safe_noise(p, 0); + value = offset + safe_noise(p, "signed"); p *= lacunarity; for (i = 1; i < (int)octaves; i++) { - increment = (safe_noise(p, 0) + offset) * pwr * value; + increment = (safe_noise(p, "signed") + offset) * pwr * value; value += increment; pwr *= pwHL; p *= lacunarity; @@ -103,7 +103,7 @@ float noise_musgrave_hetero_terrain(point p, string basis, float H, float lacuna rmd = octaves - floor(octaves); if (rmd != 0.0) { - increment = (safe_noise(p, 0) + offset) * pwr * value; + increment = (safe_noise(p, "signed") + offset) * pwr * value; value += rmd * increment; } @@ -126,7 +126,7 @@ float noise_musgrave_hybrid_multi_fractal(point p, string basis, float H, float pwr = pwHL; int i; - result = safe_noise(p, 0) + offset; + result = safe_noise(p, "signed") + offset; weight = gain * result; p *= lacunarity; @@ -134,7 +134,7 @@ float noise_musgrave_hybrid_multi_fractal(point p, string basis, float H, if 
(weight > 1.0) weight = 1.0; - signal = (safe_noise(p, 0) + offset) * pwr; + signal = (safe_noise(p, "signed") + offset) * pwr; pwr *= pwHL; result += weight * signal; weight *= gain * signal; @@ -143,7 +143,7 @@ float noise_musgrave_hybrid_multi_fractal(point p, string basis, float H, rmd = octaves - floor(octaves); if (rmd != 0.0) - result += rmd * ((safe_noise(p, 0) + offset) * pwr); + result += rmd * ((safe_noise(p, "signed") + offset) * pwr); return result; } @@ -164,7 +164,7 @@ float noise_musgrave_ridged_multi_fractal(point p, string basis, float H, float pwr = pwHL; int i; - signal = offset - fabs(safe_noise(p, 0)); + signal = offset - fabs(safe_noise(p, "signed")); signal *= signal; result = signal; weight = 1.0; @@ -172,7 +172,7 @@ float noise_musgrave_ridged_multi_fractal(point p, string basis, float H, for (i = 1; i < (int)octaves; i++) { p *= lacunarity; weight = clamp(signal * gain, 0.0, 1.0); - signal = offset - fabs(safe_noise(p, 0)); + signal = offset - fabs(safe_noise(p, "signed")); signal *= signal; signal *= weight; result += signal * pwr; diff --git a/intern/cycles/kernel/shaders/node_separate_xyz.osl b/intern/cycles/kernel/shaders/node_separate_xyz.osl new file mode 100644 index 00000000000..63725cb9995 --- /dev/null +++ b/intern/cycles/kernel/shaders/node_separate_xyz.osl @@ -0,0 +1,28 @@ +/* + * Copyright 2011-2013 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License + */ + +#include "stdosl.h" + +shader node_separate_xyz( + vector Vector = 0.8, + output float X = 0.0, + output float Y = 0.0, + output float Z = 0.0) +{ + X = Vector[0]; + Y = Vector[1]; + Z = Vector[2]; +} diff --git a/intern/cycles/kernel/shaders/node_texture.h b/intern/cycles/kernel/shaders/node_texture.h index de51559f297..2710eed414a 100644 --- a/intern/cycles/kernel/shaders/node_texture.h +++ b/intern/cycles/kernel/shaders/node_texture.h @@ -153,12 +153,12 @@ float voronoi_CrS(point p) { return 2.0 * voronoi_Cr(p) - 1.0; } /* Noise Bases */ -float safe_noise(point p, int type) +float safe_noise(point p, string type) { float f = 0.0; /* Perlin noise in range -1..1 */ - if (type == 0) + if (type == "signed") f = noise("perlin", p); /* Perlin noise in range 0..1 */ @@ -175,7 +175,7 @@ float safe_noise(point p, int type) float noise_basis(point p, string basis) { if (basis == "Perlin") - return safe_noise(p, 1); + return safe_noise(p, "unsigned"); if (basis == "Voronoi F1") return voronoi_F1S(p); if (basis == "Voronoi F2") diff --git a/intern/cycles/kernel/shaders/stdosl.h b/intern/cycles/kernel/shaders/stdosl.h index 6f824ea8ebd..1ff8f363b49 100644 --- a/intern/cycles/kernel/shaders/stdosl.h +++ b/intern/cycles/kernel/shaders/stdosl.h @@ -476,17 +476,17 @@ closure color diffuse_ramp(normal N, color colors[8]) BUILTIN; closure color phong_ramp(normal N, float exponent, color colors[8]) BUILTIN; closure color diffuse_toon(normal N, float size, float smooth) BUILTIN; closure color glossy_toon(normal N, float size, float smooth) BUILTIN; -closure color westin_backscatter(normal N, float roughness) BUILTIN; -closure color westin_sheen(normal N, float edginess) BUILTIN; closure color translucent(normal N) BUILTIN; closure color reflection(normal N) BUILTIN; closure color refraction(normal N, float eta) BUILTIN; closure color transparent() BUILTIN; closure color 
microfacet_ggx(normal N, float ag) BUILTIN; +closure color microfacet_ggx_aniso(normal N, vector T, float ax, float ay) BUILTIN; closure color microfacet_ggx_refraction(normal N, float ag, float eta) BUILTIN; closure color microfacet_beckmann(normal N, float ab) BUILTIN; +closure color microfacet_beckmann_aniso(normal N, vector T, float ax, float ay) BUILTIN; closure color microfacet_beckmann_refraction(normal N, float ab, float eta) BUILTIN; -closure color ward(normal N, vector T,float ax, float ay) BUILTIN; +closure color ashikhmin_shirley(normal N, vector T,float ax, float ay) BUILTIN; closure color ashikhmin_velvet(normal N, float sigma) BUILTIN; closure color emission() BUILTIN; closure color background() BUILTIN; @@ -505,12 +505,8 @@ closure color hair_transmission(normal N, float roughnessu, float roughnessv, ve closure color henyey_greenstein(float g) BUILTIN; closure color absorption() BUILTIN; -// Backwards compatibility -closure color bssrdf_cubic(normal N, vector radius) BUILTIN; -closure color bssrdf_gaussian(normal N, vector radius) BUILTIN; -closure color specular_toon(normal N, float size, float smooth) BUILTIN; - // Renderer state +int backfacing () BUILTIN; int raytype (string typename) BUILTIN; // the individual 'isFOOray' functions are deprecated int iscameraray () { return raytype("camera"); } diff --git a/intern/cycles/kernel/svm/svm.h b/intern/cycles/kernel/svm/svm.h index dbf59c60cb0..c13eae813d6 100644 --- a/intern/cycles/kernel/svm/svm.h +++ b/intern/cycles/kernel/svm/svm.h @@ -167,8 +167,8 @@ CCL_NAMESPACE_END #include "svm_math.h" #include "svm_mix.h" #include "svm_ramp.h" -#include "svm_sepcomb_rgb.h" #include "svm_sepcomb_hsv.h" +#include "svm_sepcomb_vector.h" #include "svm_musgrave.h" #include "svm_sky.h" #include "svm_tex_coord.h" @@ -236,7 +236,7 @@ ccl_device_noinline void svm_eval_nodes(KernelGlobals *kg, ShaderData *sd, Shade if(stack_load_float(stack, node.z) == 1.0f) offset += node.y; break; -#ifdef __IMAGE_TEXTURES__ +#ifdef 
__TEXTURES__ case NODE_TEX_IMAGE: svm_node_tex_image(kg, sd, stack, node); break; @@ -246,8 +246,6 @@ ccl_device_noinline void svm_eval_nodes(KernelGlobals *kg, ShaderData *sd, Shade case NODE_TEX_ENVIRONMENT: svm_node_tex_environment(kg, sd, stack, node); break; -#endif -#ifdef __PROCEDURAL_TEXTURES__ case NODE_TEX_SKY: svm_node_tex_sky(kg, sd, stack, node, &offset); break; @@ -327,11 +325,11 @@ ccl_device_noinline void svm_eval_nodes(KernelGlobals *kg, ShaderData *sd, Shade case NODE_MIX: svm_node_mix(kg, sd, stack, node.y, node.z, node.w, &offset); break; - case NODE_SEPARATE_RGB: - svm_node_separate_rgb(sd, stack, node.y, node.z, node.w); + case NODE_SEPARATE_VECTOR: + svm_node_separate_vector(sd, stack, node.y, node.z, node.w); break; - case NODE_COMBINE_RGB: - svm_node_combine_rgb(sd, stack, node.y, node.z, node.w); + case NODE_COMBINE_VECTOR: + svm_node_combine_vector(sd, stack, node.y, node.z, node.w); break; case NODE_SEPARATE_HSV: svm_node_separate_hsv(kg, sd, stack, node.y, node.z, node.w, &offset); @@ -407,12 +405,7 @@ ccl_device_noinline void svm_eval_nodes(KernelGlobals *kg, ShaderData *sd, Shade break; case NODE_CLOSURE_SET_NORMAL: svm_node_set_normal(kg, sd, stack, node.y, node.z ); - break; -#endif - case NODE_EMISSION_SET_WEIGHT_TOTAL: - svm_node_emission_set_weight_total(kg, sd, node.y, node.z, node.w); break; -#ifdef __EXTRA_NODES__ case NODE_RGB_RAMP: svm_node_rgb_ramp(kg, sd, stack, node, &offset); break; @@ -425,17 +418,13 @@ ccl_device_noinline void svm_eval_nodes(KernelGlobals *kg, ShaderData *sd, Shade case NODE_LIGHT_FALLOFF: svm_node_light_falloff(sd, stack, node); break; -#endif -#ifdef __ANISOTROPIC__ +#endif case NODE_TANGENT: svm_node_tangent(kg, sd, stack, node); break; -#endif -#ifdef __NORMAL_MAP__ case NODE_NORMAL_MAP: svm_node_normal_map(kg, sd, stack, node); - break; -#endif + break; case NODE_END: default: return; diff --git a/intern/cycles/kernel/svm/svm_blackbody.h b/intern/cycles/kernel/svm/svm_blackbody.h index 
63dbf27d35e..1e40e868e14 100644 --- a/intern/cycles/kernel/svm/svm_blackbody.h +++ b/intern/cycles/kernel/svm/svm_blackbody.h @@ -42,7 +42,7 @@ ccl_device void svm_node_blackbody(KernelGlobals *kg, ShaderData *sd, float *sta /* Input */ float temperature = stack_load_float(stack, temperature_offset); - if (temperature < BB_DRAPPER) { + if (temperature < BB_DRAPER) { /* just return very very dim red */ color_rgb = make_float3(1.0e-6f,0.0f,0.0f); } @@ -53,9 +53,9 @@ ccl_device void svm_node_blackbody(KernelGlobals *kg, ShaderData *sd, float *sta /* reconstruct a proper index for the table lookup, compared to OSL we don't look up two colors just one (the OSL-lerp is also automatically done for us by "lookup_table_read") */ - float t = powf((temperature - BB_DRAPPER) * (1.0f / BB_TABLE_SPACING), (1.0f / BB_TABLE_XPOWER)); + float t = powf((temperature - BB_DRAPER) * (1.0f / BB_TABLE_SPACING), (1.0f / BB_TABLE_XPOWER)); - int blackbody_table_offset = kernel_data.blackbody.table_offset; + int blackbody_table_offset = kernel_data.tables.blackbody_offset; /* Retrieve colors from the lookup table */ float lutval = t*lookuptablenormalize; diff --git a/intern/cycles/kernel/svm/svm_checker.h b/intern/cycles/kernel/svm/svm_checker.h index 8d1a1a40449..e0408ad334a 100644 --- a/intern/cycles/kernel/svm/svm_checker.h +++ b/intern/cycles/kernel/svm/svm_checker.h @@ -21,9 +21,9 @@ CCL_NAMESPACE_BEGIN ccl_device_noinline float svm_checker(float3 p) { /* avoid precision issues on unit coordinates */ - p.x = (p.x + 0.00001f)*0.9999f; - p.y = (p.y + 0.00001f)*0.9999f; - p.z = (p.z + 0.00001f)*0.9999f; + p.x = (p.x + 0.000001f)*0.999999f; + p.y = (p.y + 0.000001f)*0.999999f; + p.z = (p.z + 0.000001f)*0.999999f; int xi = float_to_int(fabsf(floorf(p.x))); int yi = float_to_int(fabsf(floorf(p.y))); diff --git a/intern/cycles/kernel/svm/svm_closure.h b/intern/cycles/kernel/svm/svm_closure.h index a3770877544..30110db3ef9 100644 --- a/intern/cycles/kernel/svm/svm_closure.h +++ 
b/intern/cycles/kernel/svm/svm_closure.h @@ -24,6 +24,7 @@ ccl_device void svm_node_glass_setup(ShaderData *sd, ShaderClosure *sc, int type if(refract) { sc->data0 = eta; sc->data1 = 0.0f; + sc->data2 = 0.0f; sd->flag |= bsdf_refraction_setup(sc); } else @@ -31,7 +32,8 @@ ccl_device void svm_node_glass_setup(ShaderData *sd, ShaderClosure *sc, int type } else if(type == CLOSURE_BSDF_MICROFACET_BECKMANN_GLASS_ID) { sc->data0 = roughness; - sc->data1 = eta; + sc->data1 = roughness; + sc->data2 = eta; if(refract) sd->flag |= bsdf_microfacet_beckmann_refraction_setup(sc); @@ -40,7 +42,8 @@ ccl_device void svm_node_glass_setup(ShaderData *sd, ShaderClosure *sc, int type } else { sc->data0 = roughness; - sc->data1 = eta; + sc->data1 = roughness; + sc->data2 = eta; if(refract) sd->flag |= bsdf_microfacet_ggx_refraction_setup(sc); @@ -135,11 +138,13 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * if(roughness == 0.0f) { sc->data0 = 0.0f; sc->data1 = 0.0f; + sc->data2 = 0.0f; sd->flag |= bsdf_diffuse_setup(sc); } else { sc->data0 = roughness; sc->data1 = 0.0f; + sc->data2 = 0.0f; sd->flag |= bsdf_oren_nayar_setup(sc); } } @@ -151,6 +156,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * if(sc) { sc->data0 = 0.0f; sc->data1 = 0.0f; + sc->data2 = 0.0f; sc->N = N; sd->flag |= bsdf_translucent_setup(sc); } @@ -162,6 +168,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * if(sc) { sc->data0 = 0.0f; sc->data1 = 0.0f; + sc->data2 = 0.0f; sc->N = N; sd->flag |= bsdf_transparent_setup(sc); } @@ -169,9 +176,10 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * } case CLOSURE_BSDF_REFLECTION_ID: case CLOSURE_BSDF_MICROFACET_GGX_ID: - case CLOSURE_BSDF_MICROFACET_BECKMANN_ID: { + case CLOSURE_BSDF_MICROFACET_BECKMANN_ID: + case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ID: { #ifdef __CAUSTICS_TRICKS__ - if(kernel_data.integrator.no_caustics && (path_flag & 
PATH_RAY_DIFFUSE)) + if(!kernel_data.integrator.caustics_reflective && (path_flag & PATH_RAY_DIFFUSE)) break; #endif ShaderClosure *sc = svm_node_closure_get_bsdf(sd, mix_weight); @@ -179,15 +187,18 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * if(sc) { sc->N = N; sc->data0 = param1; - sc->data1 = 0.0f; + sc->data1 = param1; + sc->data2 = 0.0f; /* setup bsdf */ if(type == CLOSURE_BSDF_REFLECTION_ID) sd->flag |= bsdf_reflection_setup(sc); else if(type == CLOSURE_BSDF_MICROFACET_BECKMANN_ID) sd->flag |= bsdf_microfacet_beckmann_setup(sc); - else + else if(type == CLOSURE_BSDF_MICROFACET_GGX_ID) sd->flag |= bsdf_microfacet_ggx_setup(sc); + else + sd->flag |= bsdf_ashikhmin_shirley_setup(sc); } break; @@ -196,25 +207,35 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * case CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID: case CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID: { #ifdef __CAUSTICS_TRICKS__ - if(kernel_data.integrator.no_caustics && (path_flag & PATH_RAY_DIFFUSE)) + if(!kernel_data.integrator.caustics_refractive && (path_flag & PATH_RAY_DIFFUSE)) break; #endif ShaderClosure *sc = svm_node_closure_get_bsdf(sd, mix_weight); if(sc) { sc->N = N; - sc->data0 = param1; float eta = fmaxf(param2, 1e-5f); - sc->data1 = (sd->flag & SD_BACKFACING)? 1.0f/eta: eta; + eta = (sd->flag & SD_BACKFACING)? 
1.0f/eta: eta; /* setup bsdf */ - if(type == CLOSURE_BSDF_REFRACTION_ID) + if(type == CLOSURE_BSDF_REFRACTION_ID) { + sc->data0 = eta; + sc->data1 = 0.0f; + sc->data2 = 0.0f; + sd->flag |= bsdf_refraction_setup(sc); - else if(type == CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID) - sd->flag |= bsdf_microfacet_beckmann_refraction_setup(sc); - else - sd->flag |= bsdf_microfacet_ggx_refraction_setup(sc); + } + else { + sc->data0 = param1; + sc->data1 = param1; + sc->data2 = eta; + + if(type == CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID) + sd->flag |= bsdf_microfacet_beckmann_refraction_setup(sc); + else + sd->flag |= bsdf_microfacet_ggx_refraction_setup(sc); + } } break; @@ -223,8 +244,10 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * case CLOSURE_BSDF_MICROFACET_GGX_GLASS_ID: case CLOSURE_BSDF_MICROFACET_BECKMANN_GLASS_ID: { #ifdef __CAUSTICS_TRICKS__ - if(kernel_data.integrator.no_caustics && (path_flag & PATH_RAY_DIFFUSE)) + if(!kernel_data.integrator.caustics_reflective && + !kernel_data.integrator.caustics_refractive && (path_flag & PATH_RAY_DIFFUSE)) { break; + } #endif /* index of refraction */ float eta = fmaxf(param2, 1e-5f); @@ -241,12 +264,21 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * float sample_weight = sc->sample_weight; sc = svm_node_closure_get_bsdf(sd, mix_weight*fresnel); - - if(sc) { - sc->N = N; - svm_node_glass_setup(sd, sc, type, eta, roughness, false); +#ifdef __CAUSTICS_TRICKS__ + if(kernel_data.integrator.caustics_reflective || (path_flag & PATH_RAY_DIFFUSE) == 0) +#endif + { + if(sc) { + sc->N = N; + svm_node_glass_setup(sd, sc, type, eta, roughness, false); + } } +#ifdef __CAUSTICS_TRICKS__ + if(!kernel_data.integrator.caustics_refractive && (path_flag & PATH_RAY_DIFFUSE)) + break; +#endif + /* refraction */ sc = &sd->closure[sd->num_closure]; sc->weight = weight; @@ -261,9 +293,11 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, 
float * break; } - case CLOSURE_BSDF_WARD_ID: { + case CLOSURE_BSDF_MICROFACET_BECKMANN_ANISO_ID: + case CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID: + case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ANISO_ID: { #ifdef __CAUSTICS_TRICKS__ - if(kernel_data.integrator.no_caustics && (path_flag & PATH_RAY_DIFFUSE)) + if(!kernel_data.integrator.caustics_reflective && (path_flag & PATH_RAY_DIFFUSE)) break; #endif ShaderClosure *sc = svm_node_closure_get_bsdf(sd, mix_weight); @@ -271,7 +305,6 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * if(sc) { sc->N = N; -#ifdef __ANISOTROPIC__ sc->T = stack_load_float3(stack, data_node.y); /* rotate tangent */ @@ -293,10 +326,14 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * sc->data1 = roughness/(1.0f - anisotropy); } - sd->flag |= bsdf_ward_setup(sc); -#else - sd->flag |= bsdf_diffuse_setup(sc); -#endif + sc->data2 = 0.0f; + + if (type == CLOSURE_BSDF_MICROFACET_BECKMANN_ANISO_ID) + sd->flag |= bsdf_microfacet_beckmann_aniso_setup(sc); + else if (type == CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID) + sd->flag |= bsdf_microfacet_ggx_aniso_setup(sc); + else + sd->flag |= bsdf_ashikhmin_shirley_aniso_setup(sc); } break; } @@ -309,6 +346,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * /* sigma */ sc->data0 = clamp(param1, 0.0f, 1.0f); sc->data1 = 0.0f; + sc->data2 = 0.0f; sd->flag |= bsdf_ashikhmin_velvet_setup(sc); } break; @@ -322,6 +360,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * sc->N = N; sc->data0 = param1; sc->data1 = param2; + sc->data2 = 0.0f; if (type == CLOSURE_BSDF_DIFFUSE_TOON_ID) sd->flag |= bsdf_diffuse_toon_setup(sc); @@ -339,7 +378,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * if(sc) { /* todo: giving a fixed weight here will cause issues when - * mixing multiple BSDFS. energey will not be conserved and + * mixing multiple BSDFS. 
energy will not be conserved and * the throughput can blow up after multiple bounces. we * better figure out a way to skip backfaces from rays * spawned by transmission from the front */ @@ -356,11 +395,11 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * sc->N = N; sc->data0 = param1; sc->data1 = param2; - sc->offset = -stack_load_float(stack, data_node.z); + sc->data2 = -stack_load_float(stack, data_node.z); if(!(sd->type & PRIMITIVE_ALL_CURVE)) { sc->T = normalize(sd->dPdv); - sc->offset = 0.0f; + sc->data2 = 0.0f; } else sc->T = sd->dPdu; @@ -405,6 +444,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * sc->sample_weight = sample_weight; sc->data0 = radius.x; sc->data1 = texture_blur; + sc->data2 = 0.0f; sc->T.x = sharpness; #ifdef __OSL__ sc->prim = NULL; @@ -421,6 +461,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * sc->sample_weight = sample_weight; sc->data0 = radius.y; sc->data1 = texture_blur; + sc->data2 = 0.0f; sc->T.x = sharpness; #ifdef __OSL__ sc->prim = NULL; @@ -437,6 +478,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * sc->sample_weight = sample_weight; sc->data0 = radius.z; sc->data1 = texture_blur; + sc->data2 = 0.0f; sc->T.x = sharpness; #ifdef __OSL__ sc->prim = NULL; @@ -582,16 +624,6 @@ ccl_device void svm_node_closure_set_weight(ShaderData *sd, uint r, uint g, uint svm_node_closure_store_weight(sd, weight); } -ccl_device void svm_node_emission_set_weight_total(KernelGlobals *kg, ShaderData *sd, uint r, uint g, uint b) -{ - float3 weight = make_float3(__uint_as_float(r), __uint_as_float(g), __uint_as_float(b)); - - if(sd->object != OBJECT_NONE) - weight /= object_surface_area(kg, sd->object); - - svm_node_closure_store_weight(sd, weight); -} - ccl_device void svm_node_closure_weight(ShaderData *sd, float *stack, uint weight_offset) { float3 weight = stack_load_float3(stack, weight_offset); @@ 
-603,14 +635,10 @@ ccl_device void svm_node_emission_weight(KernelGlobals *kg, ShaderData *sd, floa { uint color_offset = node.y; uint strength_offset = node.z; - uint total_power = node.w; float strength = stack_load_float(stack, strength_offset); float3 weight = stack_load_float3(stack, color_offset)*strength; - if(total_power && sd->object != OBJECT_NONE) - weight /= object_surface_area(kg, sd->object); - svm_node_closure_store_weight(sd, weight); } diff --git a/intern/cycles/kernel/svm/svm_convert.h b/intern/cycles/kernel/svm/svm_convert.h index 2503912c5c6..b221e0728ec 100644 --- a/intern/cycles/kernel/svm/svm_convert.h +++ b/intern/cycles/kernel/svm/svm_convert.h @@ -45,13 +45,13 @@ ccl_device void svm_node_convert(ShaderData *sd, float *stack, uint type, uint f } case NODE_CONVERT_VF: { float3 f = stack_load_float3(stack, from); - float g = (f.x + f.y + f.z)*(1.0f/3.0f); + float g = average(f); stack_store_float(stack, to, g); break; } case NODE_CONVERT_VI: { float3 f = stack_load_float3(stack, from); - int i = (int)((f.x + f.y + f.z)*(1.0f/3.0f)); + int i = (int)average(f); stack_store_int(stack, to, i); break; } diff --git a/intern/cycles/kernel/svm/svm_image.h b/intern/cycles/kernel/svm/svm_image.h index daf7c6652d2..8a256c9bda5 100644 --- a/intern/cycles/kernel/svm/svm_image.h +++ b/intern/cycles/kernel/svm/svm_image.h @@ -134,8 +134,8 @@ ccl_device float4 svm_image_texture(KernelGlobals *kg, int id, float x, float y, { #ifdef __KERNEL_CPU__ #ifdef __KERNEL_SSE2__ - __m128 r_m128; - float4 &r = (float4 &)r_m128; + ssef r_ssef; + float4 &r = (float4 &)r_ssef; r = kernel_tex_image_interp(id, x, y); #else float4 r = kernel_tex_image_interp(id, x, y); @@ -252,9 +252,9 @@ ccl_device float4 svm_image_texture(KernelGlobals *kg, int id, float x, float y, case 96: r = kernel_tex_image_interp(__tex_image_096, x, y); break; case 97: r = kernel_tex_image_interp(__tex_image_097, x, y); break; case 98: r = kernel_tex_image_interp(__tex_image_098, x, y); break; - case 
99: r = kernel_tex_image_interp(__tex_image_099, x, y); break; #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 300) + case 99: r = kernel_tex_image_interp(__tex_image_099, x, y); break; case 100: r = kernel_tex_image_interp(__tex_image_100, x, y); break; case 101: r = kernel_tex_image_interp(__tex_image_101, x, y); break; case 102: r = kernel_tex_image_interp(__tex_image_102, x, y); break; @@ -318,14 +318,14 @@ ccl_device float4 svm_image_texture(KernelGlobals *kg, int id, float x, float y, float alpha = r.w; if(use_alpha && alpha != 1.0f && alpha != 0.0f) { - r_m128 = _mm_div_ps(r_m128, _mm_set1_ps(alpha)); + r_ssef = r_ssef / ssef(alpha); if(id >= TEX_NUM_FLOAT_IMAGES) - r_m128 = _mm_min_ps(r_m128, _mm_set1_ps(1.0f)); + r_ssef = min(r_ssef, ssef(1.0f)); r.w = alpha; } if(srgb) { - r_m128 = color_srgb_to_scene_linear(r_m128); + r_ssef = color_srgb_to_scene_linear(r_ssef); r.w = alpha; } #else @@ -435,6 +435,10 @@ ccl_device void svm_node_tex_image_box(KernelGlobals *kg, ShaderData *sd, float weight.z = ((2.0f - limit)*N.z + (limit - 1.0f))/(2.0f*limit - 1.0f); } } + else { + /* Desperate mode, no valid choice anyway, fallback to one side.*/ + weight.x = 1.0f; + } /* now fetch textures */ uint co_offset, out_offset, alpha_offset, srgb; diff --git a/intern/cycles/kernel/svm/svm_noise.h b/intern/cycles/kernel/svm/svm_noise.h index 91dda8972f9..c77c2a1c482 100644 --- a/intern/cycles/kernel/svm/svm_noise.h +++ b/intern/cycles/kernel/svm/svm_noise.h @@ -38,11 +38,11 @@ ccl_device int quick_floor(float x) return float_to_int(x) - ((x < 0) ? 
1 : 0); } #else -ccl_device_inline __m128i quick_floor_sse(const __m128& x) +ccl_device_inline ssei quick_floor_sse(const ssef& x) { - __m128i b = _mm_cvttps_epi32(x); - __m128i isneg = _mm_castps_si128(_mm_cmplt_ps(x, _mm_set1_ps(0.0f))); - return _mm_add_epi32(b, isneg); // unsaturated add 0xffffffff is the same as subtract -1 + ssei b = truncatei(x); + ssei isneg = cast((x < ssef(0.0f)).m128); + return b + isneg; // unsaturated add 0xffffffff is the same as subtract -1 } #endif @@ -52,9 +52,9 @@ ccl_device float bits_to_01(uint bits) return bits * (1.0f/(float)0xFFFFFFFF); } #else -ccl_device_inline __m128 bits_to_01_sse(const __m128i& bits) +ccl_device_inline ssef bits_to_01_sse(const ssei& bits) { - return _mm_mul_ps(uint32_to_float(bits), _mm_set1_ps(1.0f/(float)0xFFFFFFFF)); + return uint32_to_float(bits) * ssef(1.0f/(float)0xFFFFFFFF); } #endif @@ -88,16 +88,16 @@ ccl_device uint hash(uint kx, uint ky, uint kz) } #ifdef __KERNEL_SSE2__ -ccl_device_inline __m128i hash_sse(const __m128i& kx, const __m128i& ky, const __m128i& kz) +ccl_device_inline ssei hash_sse(const ssei& kx, const ssei& ky, const ssei& kz) { -#define rot(x,k) _mm_or_si128(_mm_slli_epi32((x), (k)), _mm_srli_epi32((x), 32-(k))) -#define xor_rot(a, b, c) do {a = _mm_xor_si128(a, b); a = _mm_sub_epi32(a, rot(b, c));} while(0) +#define rot(x,k) (((x)<<(k)) | (srl(x, 32-(k)))) +#define xor_rot(a, b, c) do {a = a^b; a = a - rot(b, c);} while(0) uint len = 3; - __m128i magic = _mm_set1_epi32(0xdeadbeef + (len << 2) + 13); - __m128i a = _mm_add_epi32(magic, kx); - __m128i b = _mm_add_epi32(magic, ky); - __m128i c = _mm_add_epi32(magic, kz); + ssei magic = ssei(0xdeadbeef + (len << 2) + 13); + ssei a = magic + kx; + ssei b = magic + ky; + ssei c = magic + kz; xor_rot(c, b, 14); xor_rot(a, c, 11); @@ -133,10 +133,10 @@ ccl_device float floorfrac(float x, int* i) return x - *i; } #else -ccl_device_inline __m128 floorfrac_sse(const __m128& x, __m128i *i) +ccl_device_inline ssef floorfrac_sse(const ssef& 
x, ssei *i) { *i = quick_floor_sse(x); - return _mm_sub_ps(x, _mm_cvtepi32_ps(*i)); + return x - ssef(*i); } #endif @@ -146,11 +146,11 @@ ccl_device float fade(float t) return t * t * t * (t * (t * 6.0f - 15.0f) + 10.0f); } #else -ccl_device_inline __m128 fade_sse(const __m128 *t) +ccl_device_inline ssef fade_sse(const ssef *t) { - __m128 a = fma(*t, _mm_set1_ps(6.0f), _mm_set1_ps(-15.0f)); - __m128 b = fma(*t, a, _mm_set1_ps(10.0f)); - return _mm_mul_ps(_mm_mul_ps(*t, *t), _mm_mul_ps(*t, b)); + ssef a = madd(*t, ssef(6.0f), ssef(-15.0f)); + ssef b = madd(*t, a, ssef(10.0f)); + return ((*t) * (*t)) * ((*t) * b); } #endif @@ -160,10 +160,10 @@ ccl_device float nerp(float t, float a, float b) return (1.0f - t) * a + t * b; } #else -ccl_device_inline __m128 nerp_sse(const __m128& t, const __m128& a, const __m128& b) +ccl_device_inline ssef nerp_sse(const ssef& t, const ssef& a, const ssef& b) { - __m128 x1 = _mm_mul_ps(_mm_sub_ps(_mm_set1_ps(1.0f), t), a); - return fma(t, b, x1); + ssef x1 = (ssef(1.0f) - t) * a; + return madd(t, b, x1); } #endif @@ -178,35 +178,35 @@ ccl_device float grad(int hash, float x, float y, float z) return ((h&1) ? -u : u) + ((h&2) ? -v : v); } #else -ccl_device_inline __m128 grad_sse(const __m128i& hash, const __m128& x, const __m128& y, const __m128& z) +ccl_device_inline ssef grad_sse(const ssei& hash, const ssef& x, const ssef& y, const ssef& z) { - __m128i c1 = _mm_set1_epi32(1); - __m128i c2 = _mm_set1_epi32(2); + ssei c1 = ssei(1); + ssei c2 = ssei(2); - __m128i h = _mm_and_si128(hash, _mm_set1_epi32(15)); // h = hash & 15 + ssei h = hash & ssei(15); // h = hash & 15 - __m128i case_ux = _mm_cmplt_epi32(h, _mm_set1_epi32(8)); // 0xffffffff if h < 8 else 0 + sseb case_ux = h < ssei(8); // 0xffffffff if h < 8 else 0 - __m128 u = blend(_mm_castsi128_ps(case_ux), x, y); // u = h<8 ? x : y + ssef u = select(case_ux, x, y); // u = h<8 ? 
x : y - __m128i case_vy = _mm_cmplt_epi32(h, _mm_set1_epi32(4)); // 0xffffffff if h < 4 else 0 + sseb case_vy = h < ssei(4); // 0xffffffff if h < 4 else 0 - __m128i case_h12 = _mm_cmpeq_epi32(h, _mm_set1_epi32(12)); // 0xffffffff if h == 12 else 0 - __m128i case_h14 = _mm_cmpeq_epi32(h, _mm_set1_epi32(14)); // 0xffffffff if h == 14 else 0 + sseb case_h12 = h == ssei(12); // 0xffffffff if h == 12 else 0 + sseb case_h14 = h == ssei(14); // 0xffffffff if h == 14 else 0 - __m128i case_vx = _mm_or_si128(case_h12, case_h14); // 0xffffffff if h == 12 or h == 14 else 0 + sseb case_vx = case_h12 | case_h14; // 0xffffffff if h == 12 or h == 14 else 0 - __m128 v = blend(_mm_castsi128_ps(case_vy), y, blend(_mm_castsi128_ps(case_vx), x, z)); // v = h<4 ? y : h == 12 || h == 14 ? x : z + ssef v = select(case_vy, y, select(case_vx, x, z)); // v = h<4 ? y : h == 12 || h == 14 ? x : z - __m128i case_uneg = _mm_slli_epi32(_mm_and_si128(h, c1), 31); // 1<<31 if h&1 else 0 - __m128 case_uneg_mask = _mm_castsi128_ps(case_uneg); // -0.0 if h&1 else +0.0 - __m128 ru = _mm_xor_ps(u, case_uneg_mask); // -u if h&1 else u (copy float sign) + ssei case_uneg = (h & c1) << 31; // 1<<31 if h&1 else 0 + ssef case_uneg_mask = cast(case_uneg); // -0.0 if h&1 else +0.0 + ssef ru = u ^ case_uneg_mask; // -u if h&1 else u (copy float sign) - __m128i case_vneg = _mm_slli_epi32(_mm_and_si128(h, c2), 30); // 2<<30 if h&2 else 0 - __m128 case_vneg_mask = _mm_castsi128_ps(case_vneg); // -0.0 if h&2 else +0.0 - __m128 rv = _mm_xor_ps(v, case_vneg_mask); // -v if h&2 else v (copy float sign) + ssei case_vneg = (h & c2) << 30; // 2<<30 if h&2 else 0 + ssef case_vneg_mask = cast(case_vneg); // -0.0 if h&2 else +0.0 + ssef rv = v ^ case_vneg_mask; // -v if h&2 else v (copy float sign) - __m128 r = _mm_add_ps(ru, rv); // ((h&1) ? -u : u) + ((h&2) ? -v : v) + ssef r = ru + rv; // ((h&1) ? -u : u) + ((h&2) ? 
-v : v) return r; } #endif @@ -217,9 +217,9 @@ ccl_device float scale3(float result) return 0.9820f * result; } #else -ccl_device_inline __m128 scale3_sse(const __m128& result) +ccl_device_inline ssef scale3_sse(const ssef& result) { - return _mm_mul_ps(_mm_set1_ps(0.9820f), result); + return ssef(0.9820f) * result; } #endif @@ -252,75 +252,41 @@ ccl_device_noinline float perlin(float x, float y, float z) #else ccl_device_noinline float perlin(float x, float y, float z) { - __m128 xyz = _mm_setr_ps(x, y, z, 0.0f); - __m128i XYZ; + ssef xyz = ssef(x, y, z, 0.0f); + ssei XYZ; - __m128 fxyz = floorfrac_sse(xyz, &XYZ); + ssef fxyz = floorfrac_sse(xyz, &XYZ); - __m128 uvw = fade_sse(&fxyz); - __m128 u = broadcast<0>(uvw), v = broadcast<1>(uvw), w = broadcast<2>(uvw); + ssef uvw = fade_sse(&fxyz); + ssef u = shuffle<0>(uvw), v = shuffle<1>(uvw), w = shuffle<2>(uvw); - __m128i XYZ_ofc = _mm_add_epi32(XYZ, _mm_set1_epi32(1)); - __m128i vdy = shuffle<1, 1, 1, 1>(XYZ, XYZ_ofc); // +0, +0, +1, +1 - __m128i vdz = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(XYZ, XYZ_ofc)); // +0, +1, +0, +1 + ssei XYZ_ofc = XYZ + ssei(1); + ssei vdy = shuffle<1, 1, 1, 1>(XYZ, XYZ_ofc); // +0, +0, +1, +1 + ssei vdz = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(XYZ, XYZ_ofc)); // +0, +1, +0, +1 - __m128i h1 = hash_sse(broadcast<0>(XYZ), vdy, vdz); // hash directions 000, 001, 010, 011 - __m128i h2 = hash_sse(broadcast<0>(XYZ_ofc), vdy, vdz); // hash directions 100, 101, 110, 111 + ssei h1 = hash_sse(shuffle<0>(XYZ), vdy, vdz); // hash directions 000, 001, 010, 011 + ssei h2 = hash_sse(shuffle<0>(XYZ_ofc), vdy, vdz); // hash directions 100, 101, 110, 111 - __m128 fxyz_ofc = _mm_sub_ps(fxyz, _mm_set1_ps(1.0f)); - __m128 vfy = shuffle<1, 1, 1, 1>(fxyz, fxyz_ofc); - __m128 vfz = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(fxyz, fxyz_ofc)); + ssef fxyz_ofc = fxyz - ssef(1.0f); + ssef vfy = shuffle<1, 1, 1, 1>(fxyz, fxyz_ofc); + ssef vfz = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(fxyz, fxyz_ofc)); - __m128 g1 = 
grad_sse(h1, broadcast<0>(fxyz), vfy, vfz); - __m128 g2 = grad_sse(h2, broadcast<0>(fxyz_ofc), vfy, vfz); - __m128 n1 = nerp_sse(u, g1, g2); + ssef g1 = grad_sse(h1, shuffle<0>(fxyz), vfy, vfz); + ssef g2 = grad_sse(h2, shuffle<0>(fxyz_ofc), vfy, vfz); + ssef n1 = nerp_sse(u, g1, g2); - __m128 n1_half = shuffle<2, 3, 2, 3>(n1); // extract 2 floats to a separate vector - __m128 n2 = nerp_sse(v, n1, n1_half); // process nerp([a, b, _, _], [c, d, _, _]) -> [a', b', _, _] + ssef n1_half = shuffle<2, 3, 2, 3>(n1); // extract 2 floats to a separate vector + ssef n2 = nerp_sse(v, n1, n1_half); // process nerp([a, b, _, _], [c, d, _, _]) -> [a', b', _, _] - __m128 n2_second = broadcast<1>(n2); // extract b to a separate vector - __m128 result = nerp_sse(w, n2, n2_second); // process nerp([a', _, _, _], [b', _, _, _]) -> [a'', _, _, _] + ssef n2_second = shuffle<1>(n2); // extract b to a separate vector + ssef result = nerp_sse(w, n2, n2_second); // process nerp([a', _, _, _], [b', _, _, _]) -> [a'', _, _, _] - __m128 r = scale3_sse(result); + ssef r = scale3_sse(result); - __m128 infmask = _mm_castsi128_ps(_mm_set1_epi32(0x7f800000)); - __m128 rinfmask = _mm_cmpeq_ps(_mm_and_ps(r, infmask), infmask); // 0xffffffff if r is inf/-inf/nan else 0 - __m128 rfinite = _mm_andnot_ps(rinfmask, r); // 0 if r is inf/-inf/nan else r - return _mm_cvtss_f32(rfinite); -} -#endif - -#if 0 // unused -ccl_device_noinline float perlin_periodic(float x, float y, float z, float3 pperiod) -{ - int X; float fx = floorfrac(x, &X); - int Y; float fy = floorfrac(y, &Y); - int Z; float fz = floorfrac(z, &Z); - - int3 p; - - p.x = max(quick_floor(pperiod.x), 1); - p.y = max(quick_floor(pperiod.y), 1); - p.z = max(quick_floor(pperiod.z), 1); - - float u = fade(fx); - float v = fade(fy); - float w = fade(fz); - - float result; - - result = nerp (w, nerp (v, nerp (u, grad (phash (X , Y , Z , p), fx , fy , fz ), - grad (phash (X+1, Y , Z , p), fx-1.0f, fy , fz )), - nerp (u, grad (phash (X , Y+1, Z , p), 
fx , fy-1.0f, fz ), - grad (phash (X+1, Y+1, Z , p), fx-1.0f, fy-1.0f, fz ))), - nerp (v, nerp (u, grad (phash (X , Y , Z+1, p), fx , fy , fz-1.0f ), - grad (phash (X+1, Y , Z+1, p), fx-1.0f, fy , fz-1.0f )), - nerp (u, grad (phash (X , Y+1, Z+1, p), fx , fy-1.0f, fz-1.0f ), - grad (phash (X+1, Y+1, Z+1, p), fx-1.0f, fy-1.0f, fz-1.0f )))); - float r = scale3(result); - - /* can happen for big coordinates, things even out to 0.0 then anyway */ - return (isfinite(r))? r: 0.0f; + ssef infmask = cast(ssei(0x7f800000)); + ssef rinfmask = ((r & infmask) == infmask).m128; // 0xffffffff if r is inf/-inf/nan else 0 + ssef rfinite = andnot(rinfmask, r); // 0 if r is inf/-inf/nan else r + return extract<0>(rfinite); } #endif @@ -357,30 +323,15 @@ ccl_device float3 cellnoise_color(float3 p) return make_float3(r, g, b); } #else -ccl_device __m128 cellnoise_color(const __m128& p) +ccl_device ssef cellnoise_color(const ssef& p) { - __m128i ip = quick_floor_sse(p); - __m128i ip_yxz = shuffle<1, 0, 2, 3>(ip); - __m128i ip_xyy = shuffle<0, 1, 1, 3>(ip); - __m128i ip_zzx = shuffle<2, 2, 0, 3>(ip); + ssei ip = quick_floor_sse(p); + ssei ip_yxz = shuffle<1, 0, 2, 3>(ip); + ssei ip_xyy = shuffle<0, 1, 1, 3>(ip); + ssei ip_zzx = shuffle<2, 2, 0, 3>(ip); return bits_to_01_sse(hash_sse(ip_xyy, ip_yxz, ip_zzx)); } #endif -#if 0 // unused -/* periodic perlin noise in range 0..1 */ -ccl_device float pnoise(float3 p, float3 pperiod) -{ - float r = perlin_periodic(p.x, p.y, p.z, pperiod); - return 0.5f*r + 0.5f; -} - -/* periodic perlin noise in range -1..1 */ -ccl_device float psnoise(float3 p, float3 pperiod) -{ - return perlin_periodic(p.x, p.y, p.z, pperiod); -} -#endif - CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/svm/svm_sepcomb_rgb.h b/intern/cycles/kernel/svm/svm_sepcomb_rgb.h deleted file mode 100644 index 34c4449ecdb..00000000000 --- a/intern/cycles/kernel/svm/svm_sepcomb_rgb.h +++ /dev/null @@ -1,42 +0,0 @@ -/* - * Copyright 2011-2013 Blender Foundation - * - * Licensed under 
the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License - */ - -CCL_NAMESPACE_BEGIN - -ccl_device void svm_node_combine_rgb(ShaderData *sd, float *stack, uint in_offset, uint color_index, uint out_offset) -{ - float color = stack_load_float(stack, in_offset); - - if (stack_valid(out_offset)) - stack_store_float(stack, out_offset+color_index, color); -} - -ccl_device void svm_node_separate_rgb(ShaderData *sd, float *stack, uint icolor_offset, uint color_index, uint out_offset) -{ - float3 color = stack_load_float3(stack, icolor_offset); - - if (stack_valid(out_offset)) { - if (color_index == 0) - stack_store_float(stack, out_offset, color.x); - else if (color_index == 1) - stack_store_float(stack, out_offset, color.y); - else - stack_store_float(stack, out_offset, color.z); - } -} - -CCL_NAMESPACE_END - diff --git a/intern/cycles/kernel/svm/svm_sepcomb_vector.h b/intern/cycles/kernel/svm/svm_sepcomb_vector.h new file mode 100644 index 00000000000..c8e7e34f87d --- /dev/null +++ b/intern/cycles/kernel/svm/svm_sepcomb_vector.h @@ -0,0 +1,44 @@ +/* + * Copyright 2011-2014 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License + */ + +CCL_NAMESPACE_BEGIN + +/* Vector combine / separate, used for the RGB and XYZ nodes */ + +ccl_device void svm_node_combine_vector(ShaderData *sd, float *stack, uint in_offset, uint vector_index, uint out_offset) +{ + float vector = stack_load_float(stack, in_offset); + + if (stack_valid(out_offset)) + stack_store_float(stack, out_offset+vector_index, vector); +} + +ccl_device void svm_node_separate_vector(ShaderData *sd, float *stack, uint ivector_offset, uint vector_index, uint out_offset) +{ + float3 vector = stack_load_float3(stack, ivector_offset); + + if (stack_valid(out_offset)) { + if (vector_index == 0) + stack_store_float(stack, out_offset, vector.x); + else if (vector_index == 1) + stack_store_float(stack, out_offset, vector.y); + else + stack_store_float(stack, out_offset, vector.z); + } +} + +CCL_NAMESPACE_END + diff --git a/intern/cycles/kernel/svm/svm_texture.h b/intern/cycles/kernel/svm/svm_texture.h index 5fd9204cbf6..d97c85db36a 100644 --- a/intern/cycles/kernel/svm/svm_texture.h +++ b/intern/cycles/kernel/svm/svm_texture.h @@ -140,15 +140,15 @@ ccl_device float voronoi_F1_distance(float3 p) } } #else - __m128 vec_p = load_m128(p); - __m128i xyzi = quick_floor_sse(vec_p); + ssef vec_p = load4f(p); + ssei xyzi = quick_floor_sse(vec_p); for (int xx = -1; xx <= 1; xx++) { for (int yy = -1; yy <= 1; yy++) { for (int zz = -1; zz <= 1; zz++) { - __m128 ip = _mm_cvtepi32_ps(_mm_add_epi32(xyzi, _mm_setr_epi32(xx, yy, zz, 0))); - __m128 vp = _mm_add_ps(ip, cellnoise_color(ip)); - float d = len_squared<1, 1, 1, 0>(_mm_sub_ps(vec_p, 
vp)); + ssef ip = ssef(xyzi + ssei(xx, yy, zz, 0)); + ssef vp = ip + cellnoise_color(ip); + float d = len_squared<1, 1, 1, 0>(vec_p - vp); da = min(d, da); } } @@ -184,15 +184,15 @@ ccl_device float3 voronoi_F1_color(float3 p) return cellnoise_color(pa); #else - __m128 pa, vec_p = load_m128(p); - __m128i xyzi = quick_floor_sse(vec_p); + ssef pa, vec_p = load4f(p); + ssei xyzi = quick_floor_sse(vec_p); for (int xx = -1; xx <= 1; xx++) { for (int yy = -1; yy <= 1; yy++) { for (int zz = -1; zz <= 1; zz++) { - __m128 ip = _mm_cvtepi32_ps(_mm_add_epi32(xyzi, _mm_setr_epi32(xx, yy, zz, 0))); - __m128 vp = _mm_add_ps(ip, cellnoise_color(ip)); - float d = len_squared<1, 1, 1, 0>(_mm_sub_ps(vec_p, vp)); + ssef ip = ssef(xyzi + ssei(xx, yy, zz, 0)); + ssef vp = ip + cellnoise_color(ip); + float d = len_squared<1, 1, 1, 0>(vec_p - vp); if(d < da) { da = d; @@ -202,7 +202,7 @@ ccl_device float3 voronoi_F1_color(float3 p) } } - __m128 color = cellnoise_color(pa); + ssef color = cellnoise_color(pa); return (float3 &)color; #endif } diff --git a/intern/cycles/kernel/svm/svm_types.h b/intern/cycles/kernel/svm/svm_types.h index 80972ec82bc..fbe669c1fab 100644 --- a/intern/cycles/kernel/svm/svm_types.h +++ b/intern/cycles/kernel/svm/svm_types.h @@ -72,15 +72,14 @@ typedef enum NodeType { NODE_TEX_COORD, NODE_TEX_COORD_BUMP_DX, NODE_TEX_COORD_BUMP_DY, - NODE_EMISSION_SET_WEIGHT_TOTAL, NODE_ATTR_BUMP_DX, NODE_ATTR_BUMP_DY, NODE_TEX_ENVIRONMENT, NODE_CLOSURE_HOLDOUT, NODE_LAYER_WEIGHT, NODE_CLOSURE_VOLUME, - NODE_SEPARATE_RGB, - NODE_COMBINE_RGB, + NODE_SEPARATE_VECTOR, + NODE_COMBINE_VECTOR, NODE_SEPARATE_HSV, NODE_COMBINE_HSV, NODE_HSV, @@ -349,7 +348,6 @@ typedef enum ClosureType { /* Diffuse */ CLOSURE_BSDF_DIFFUSE_ID, CLOSURE_BSDF_OREN_NAYAR_ID, - CLOSURE_BSDF_WESTIN_SHEEN_ID, CLOSURE_BSDF_DIFFUSE_RAMP_ID, CLOSURE_BSDF_DIFFUSE_TOON_ID, @@ -358,9 +356,11 @@ typedef enum ClosureType { CLOSURE_BSDF_REFLECTION_ID, CLOSURE_BSDF_MICROFACET_GGX_ID, CLOSURE_BSDF_MICROFACET_BECKMANN_ID, - 
CLOSURE_BSDF_WARD_ID, + CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ID, + CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID, + CLOSURE_BSDF_MICROFACET_BECKMANN_ANISO_ID, + CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ANISO_ID, CLOSURE_BSDF_ASHIKHMIN_VELVET_ID, - CLOSURE_BSDF_WESTIN_BACKSCATTER_ID, CLOSURE_BSDF_PHONG_RAMP_ID, CLOSURE_BSDF_GLOSSY_TOON_ID, CLOSURE_BSDF_HAIR_REFLECTION_ID, @@ -404,7 +404,7 @@ typedef enum ClosureType { #define CLOSURE_IS_BSDF_GLOSSY(type) (type >= CLOSURE_BSDF_GLOSSY_ID && type <= CLOSURE_BSDF_HAIR_REFLECTION_ID) #define CLOSURE_IS_BSDF_TRANSMISSION(type) (type >= CLOSURE_BSDF_TRANSMISSION_ID && type <= CLOSURE_BSDF_HAIR_TRANSMISSION_ID) #define CLOSURE_IS_BSDF_BSSRDF(type) (type == CLOSURE_BSDF_BSSRDF_ID) -#define CLOSURE_IS_BSDF_ANISOTROPIC(type) (type == CLOSURE_BSDF_WARD_ID) +#define CLOSURE_IS_BSDF_ANISOTROPIC(type) (type >= CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID && type <= CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ANISO_ID) #define CLOSURE_IS_BSDF_OR_BSSRDF(type) (type <= CLOSURE_BSSRDF_GAUSSIAN_ID) #define CLOSURE_IS_BSSRDF(type) (type >= CLOSURE_BSSRDF_CUBIC_ID && type <= CLOSURE_BSSRDF_GAUSSIAN_ID) #define CLOSURE_IS_VOLUME(type) (type >= CLOSURE_VOLUME_ID && type <= CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID) |